In [1]:
# import libraries
from tensorflow.keras.models import load_model
import pandas as pd
import glob
import numpy as np 

np.random.seed(305)

In [2]:
load = pd.read_csv('employment_cube_june19.TXT', low_memory=False)
load.shape

(2123146, 19)

In [3]:
load.head()

Unnamed: 0,AGYSUB,LOC,AGELVL,EDLVL,GSEGRD,LOSLVL,OCC,PATCO,PPGRD,SALLVL,STEMOCC,SUPERVIS,TOA,WORKSCH,WORKSTAT,DATECODE,EMPLOYMENT,SALARY,LOS
0,AA00,11,D,15,14.0,C,905,1,GS-14,L,XXXX,8,30,F,1,201906,1,125005.0,4.8
1,AA00,11,E,14,14.0,D,905,1,GS-14,K,XXXX,8,30,F,1,201906,1,117191.0,8.5
2,AA00,11,I,18,14.0,A,905,1,GS-14,K,XXXX,8,40,F,2,201906,1,117191.0,0.3
3,AA00,11,E,13,,F,340,2,ES-**,Q,XXXX,2,50,F,1,201906,1,176900.0,16.0
4,AA00,11,D,15,14.0,D,905,1,GS-14,K,XXXX,8,30,F,1,201906,1,117191.0,7.8


In [4]:
def load_emp_fact(path):

    '''load and clean the employment fact table

    Input: path - location of the employment cube csv file

    Output: returns a dataframe with '$' and ',' stripped from SALARY, every row
            holding a missing value removed, SALARY cast to integer and the
            EDLVL, EMPLOYMENT, STEMOCC, SUPERVIS and WORKSTAT columns dropped '''

    # code columns are read as text so their values keep the exact form found in the file
    df_total = pd.read_csv(path, dtype={'AGYSUB': str, 'LOC': str, 'AGELVL': str, 'EDLVL': str,
                                        'GSEGRD': str, 'LOSLVL': str, 'OCC': str, 'PATCO': int,
                                        'PPGRD': str, 'SALLVL': str, 'STEMOCC': str, 'SUPERVIS': str,
                                        'TOA': str, 'WORKSCH': str, 'WORKSTAT': str, 'DATECODE': str,
                                        'EMPLOYMENT': str, 'SALARY': str, 'LOS': float})

    # clean salary col; regex=False treats '$' as a literal character on every
    # pandas version instead of depending on the default value of `regex`
    salary_text = df_total['SALARY'].str.replace('$', '', regex=False)
    df_total['SALARY'] = salary_text.str.replace(',', '', regex=False)

    # a missing value in ANY column removes the whole row
    df_total = df_total.dropna(axis=0, how='any')

    # to_numeric accepts '117191.0' as well as '117191'; astype(int) alone rejects the former
    df_total = df_total.assign(SALARY=pd.to_numeric(df_total['SALARY']).astype(int))

    # return a new frame rather than dropping in place
    return df_total.drop(columns=['EDLVL', 'EMPLOYMENT', 'STEMOCC', 'SUPERVIS', 'WORKSTAT'])

In [5]:
cube_clean = load_emp_fact('employment_cube_june19.TXT')
cube_clean.head()

Unnamed: 0,AGYSUB,LOC,AGELVL,GSEGRD,LOSLVL,OCC,PATCO,PPGRD,SALLVL,TOA,WORKSCH,DATECODE,SALARY,LOS
0,AA00,11,D,14,C,905,1,GS-14,L,30,F,201906,125005,4.8
1,AA00,11,E,14,D,905,1,GS-14,K,30,F,201906,117191,8.5
2,AA00,11,I,14,A,905,1,GS-14,K,40,F,201906,117191,0.3
4,AA00,11,D,14,D,905,1,GS-14,K,30,F,201906,117191,7.8
6,AA00,11,E,15,D,905,1,GS-15,P,30,F,201906,165417,9.8


In [6]:
refData = 'C:/Users/604070/Desktop/Files/Performance/OPM_HR_Analytics/Model_Development/Data/REF_Tables/*'

def load_dim_table(pattern=None):

    '''load reference tables

    Input: pattern - optional glob pattern selecting the reference files;
           when omitted the module level refData pattern is used

    Output: returns a list containing dataframes of each reference table,
            ordered by sorted file path '''

    if pattern is None:
        pattern = refData

    # glob.glob yields paths in arbitrary order; sorting keeps every table at
    # the same list position on each run, which positional lookups depend on
    return [pd.read_csv(name) for name in sorted(glob.glob(pattern))]


def join_func(df, df_other):

    '''Input: df - left dataframe ... df_other - dataframe merged onto df

       Output: Returns the merge of both dataframes using pandas' default
               merge settings (no explicit key or join type is passed) '''

    merged = pd.merge(df, df_other)
    return merged


def joins(df):

    '''Inputs: df - fact dataframe to join to the reference tables

       Output: Returns the joined dataframe restricted to the 22 columns listed
               below; a listed column that no join produced is returned filled
               with NaN'''

    #indexing based on number and order of reference tables found in the directory
    dim = load_dim_table()

    # list positions of the tables that are joined, in join order:
    # 10 patco, 1 age, 2 agency, 7 location, 8 los, 11 paygroup, 12 salary,
    # 16 toa, 18 worksch, 9 occ
    # (0 acc, 4 education, 5 date, 6 grade, 13 separation, 14 stem,
    #  15 supervisor and 17 workstat are loaded but never joined)
    join_order = (10, 1, 2, 7, 8, 11, 12, 16, 18, 9)

    df_join = df

    for position in join_order:
        df_join = join_func(df_join, dim[position])

    keep = ['AGELVLT', 'AGYSUBT', 'AGYT', 'AGYTYPT', 'GSEGRD', 'LOCT',
            'LOCTYPT', 'LOSLVLT', 'OCCFAMT', 'OCCT', 'PATCOT', 'PAYPLANT', 'PPGRD', 'PPGROUPT', 'QTRT',
            'SALLVLT', 'TOAT', 'TOATYPT', 'WORKSCHT', 'WSTYPT', 'Year', 'Month']

    # .loc with labels absent from the frame raises KeyError on current pandas;
    # reindex keeps the earlier result: absent columns come back as NaN
    return df_join.reindex(columns=keep)


In [7]:
df = joins(cube_clean)
df.shape

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_lowerdim(tup)


(1259087, 22)

In [8]:
def df_time(length, distr):

    '''Input: length - number of month values to draw ... distr - 'uniform' for
              equal month weights; any other value selects the historical weights

       Output: Returns a single column dataframe of month numbers (1-12) sampled
               with replacement from numpy's global random state '''

    # NOTE(review): these weights are strictly decreasing from the first entry to
    # the last - confirm they are in calendar order (JAN..DEC) and not sorted by
    # frequency before they are paired with months 1..12
    historical_weights = [0.11296113266468223,
                          0.10026245351832906,
                          0.09717002656371118,
                          0.08925156128731015,
                          0.08850775807969499,
                          0.08713398925006596,
                          0.07744573540768568,
                          0.07673328577237595,
                          0.07551580244162327,
                          0.0719665780566477,
                          0.06507940412777015,
                          0.05797227283010367]

    prob = [1/12]*12 if distr == 'uniform' else historical_weights

    month_numbers = np.arange(1, 13)
    draws = np.random.choice(month_numbers, length, replace=True, p=prob)

    return pd.DataFrame(draws)

In [9]:
def time_features(rows, distr, year='2017'):

    '''Input: rows - number of rows to generate ... distr - distribution name
              passed on to df_time ... year - value written to every row of
              Year_rp (defaults to '2017')

       Output: Returns a dataframe with the columns Month_rp (three letter month),
               Year_rp and QTRT_rp (quarter label of the month) '''

    mon_abv = {1: "JAN", 2: "FEB", 3: "MAR", 4: "APR",
               5: "MAY", 6: "JUN", 7: "JUL", 8: "AUG",
               9: "SEP", 10: "OCT", 11: "NOV", 12: "DEC"}

    qtr_abv = {"JAN": 'JAN-MAR', "FEB": 'JAN-MAR', "MAR": 'JAN-MAR',
               'APR': 'APR-JUN', 'MAY': 'APR-JUN', 'JUN': 'APR-JUN',
               'JUL': 'JUL-SEP', 'AUG': 'JUL-SEP', 'SEP': 'JUL-SEP',
               'OCT': 'OCT-DEC', 'NOV': 'OCT-DEC', 'DEC': 'OCT-DEC'}

    # first (only) column of the sampled frame holds the month numbers
    month_numbers = df_time(rows, distr).iloc[:, 0]

    time_df = pd.DataFrame()
    time_df['Month_rp'] = month_numbers.map(mon_abv)
    time_df['Year_rp'] = year

    # look the month column up by name so the quarter mapping does not depend
    # on Month_rp being the first column
    time_df['QTRT_rp'] = time_df['Month_rp'].map(qtr_abv)

    return time_df

In [10]:
df_t_uniform = time_features(df.shape[0], 'uniform')

In [11]:
df_t_historical = time_features(df.shape[0], 'historical')

In [12]:
df_t_uniform['Month_rp'].value_counts()

JAN    105387
NOV    105382
AUG    105090
MAR    105047
JUL    105047
DEC    104972
APR    104970
MAY    104807
JUN    104795
SEP    104742
OCT    104671
FEB    104177
Name: Month_rp, dtype: int64

In [13]:
df_t_historical['Month_rp'].value_counts()

JAN    142761
FEB    126486
MAR    122240
APR    112345
MAY    111293
JUN    109680
JUL     97909
AUG     96761
SEP     94825
OCT     90353
NOV     81613
DEC     72821
Name: Month_rp, dtype: int64

In [14]:
def impute(cube, time_cube):

    '''Input: cube - dataframe receiving the time columns ... time_cube - dataframe
              holding the Month_rp, Year_rp and QTRT_rp columns

       Output: Returns a NEW dataframe equal to cube with Month, Year and QTRT
               set from time_cube; cube itself is left unchanged so several
               imputations of the same cube do not overwrite each other.
               Values are matched on the row index of the two dataframes '''

    return cube.assign(Month=time_cube['Month_rp'],
                       Year=time_cube['Year_rp'],
                       QTRT=time_cube['QTRT_rp'])

In [15]:
df_imputed_uniform = impute(df, df_t_uniform)

In [16]:
df_imputed_uniform.Month.value_counts()

JAN    105387
NOV    105382
AUG    105090
MAR    105047
JUL    105047
DEC    104972
APR    104970
MAY    104807
JUN    104795
SEP    104742
OCT    104671
FEB    104177
Name: Month, dtype: int64

In [17]:
def base_structure(path='C:/Users/604070/Desktop/Files/Performance/OPM_HR_Analytics/Dataset/opm_raw.txt'):

    '''Input: path - location of the raw csv file (defaults to the original
              hard coded location)

       Output: Returns a dataframe holding the 22 columns selected below, with
               Year and Month sliced out of EFDATET, QTRT cut to its first seven
               characters and GSEGRD cast to text '''

    data = pd.read_csv(path, low_memory=False)

    # presumably EFDATET looks like 'JAN 2017' (month, one separator, year) - confirm;
    # .str slicing leaves missing values missing where a per-row lambda would raise
    data['Year'] = data['EFDATET'].str[4:]
    data['Month'] = data['EFDATET'].str[:3]
    data['QTRT'] = data['QTRT'].str[:7]
    data['GSEGRD'] = data['GSEGRD'].astype(str)

    keep = ['AGELVLT', 'AGYSUBT', 'AGYT', 'AGYTYPT',
            'GSEGRD', 'LOCT', 'LOCTYPT', 'LOSLVLT', 'OCCFAMT', 'OCCT', 'PATCOT', 'PAYPLANT', 'PPGRD', 'PPGROUPT', 'QTRT',
            'SALLVLT', 'TOAT', 'TOATYPT', 'WORKSCHT', 'WSTYPT', 'Year', 'Month']

    return data[keep]

In [18]:
df_truth = base_structure()

In [19]:
def dummy_data(pred_data, base):

    '''Input: pred_data - dataframe to encode ... base - dataframe stacked under
              pred_data so that its categories also receive dummy columns

       Output: Returns the one-hot encoded rows of pred_data (the first
               pred_data.shape[0] rows of the stacked encoding) without the
               dummy columns listed below. Every listed column has to exist
               in the encoding '''

    excluded = ['LOCT_MG-MONGOLIA',
                'LOCT_NG-NIGER',
                'GSEGRD_03',
                'LOCT_SL-SIERRA LEONE',
                'GSEGRD_02',
                'LOCT_AU-AUSTRIA',
                'LOCT_UV-BURKINA FASO',
                'LOCT_CE-SRI LANKA',
                'GSEGRD_04',
                'AGYT_GE-BARRY GOLDWATER SCHOLARSHIP AND EXCELLENCE IN EDUCATION FOUNDATION',
                'GSEGRD_09',
                'LOCT_MT-MALTA',
                'LOCT_TD-TRINIDAD AND TOBAGO',
                'LOCT_BM-BURMA',
                'GSEGRD_08',
                'LOCT_HR-CROATIA',
                'AGYSUBT_CM53-BUREAU OF ECONOMIC ANALYSIS',
                'LOCT_RW-RWANDA',
                'GSEGRD_05',
                'LOCT_NZ-NEW ZEALAND',
                'LOCT_NP-NEPAL',
                'LOCT_CM-CAMEROON',
                'LOCT_GV-GUINEA',
                'LOCT_GB-GABON',
                'GSEGRD_07',
                'AGYSUBT_GE00-BARRY GOLDWATER SCHOLARSHIP AND EXCELLENCE IN EDUCATION FOUNDATION',
                'GSEGRD_01',
                'LOCT_LH-LITHUANIA',
                'LOCT_DA-DENMARK',
                'GSEGRD_06']

    # encode both frames together, then keep only the leading pred_data rows
    n_pred = pred_data.shape[0]
    encoded = pd.get_dummies(pd.concat([pred_data, base], axis=0))
    pred_encoded = encoded.iloc[:n_pred, :]

    return pred_encoded.drop(columns=excluded)

In [20]:
df_dummy_uniform = dummy_data(df_imputed_uniform, df_truth)

In [21]:
df_dummy_uniform.Month_DEC.value_counts()

0    1154115
1     104972
Name: Month_DEC, dtype: int64

In [22]:
# models
loaded_model = load_model('saved_model.h5')

In [23]:
def predict_api(model, data, confidence):

    '''Input: model - object exposing predict(data) ... data - rows to score ...
              confidence - threshold applied to column 1 of the predictions

       Output: Returns (count, rows) where count is the sum of the per-row argmax
               taken after column-1 scores below confidence were set to 0, and
               rows is data.shape[0] '''

    scores = model.predict(data)

    # a column-1 score under the threshold is zeroed so it cannot win the argmax
    below_threshold = scores[:, 1] < confidence
    scores[below_threshold, 1] = 0

    winners = scores.argmax(axis=1)

    return winners.sum(), data.shape[0]

In [24]:
quits_u, pop = predict_api(loaded_model, df_dummy_uniform, .82)

In [25]:
quits_u/pop

0.037419971773197565

In [26]:
conf = .82

In [27]:
# Score every dummy column on its own: take the rows where the column equals 1
# and record the subset size together with the count returned by predict_api.
# Rows are collected first and the frame is built once, so text is never written
# into a float column, and only the empty-subset case is reported as
# "No entries" - any other error now surfaces instead of being swallowed.
uniform_records = []

for i, col in enumerate(df_dummy_uniform.columns):

    pred_data = df_dummy_uniform[df_dummy_uniform[col]==1]

    if pred_data.shape[0] == 0:
        # nothing to score: keep the category with zero counts
        print(i,'   ',col, "No entries in the dataset")
        uniform_records.append((col, 0.0, 0.0))
        continue

    # named n_quits rather than quit so the builtin is not shadowed
    n_quits, sample = predict_api(loaded_model, pred_data, conf)

    uniform_records.append((col, float(sample), float(n_quits)))

df_result_uniform = pd.DataFrame(uniform_records, columns = ['Category', 'Sample_size', 'Predictions_Uniform'])
    

11     AGELVLT_Unspecified No entries in the dataset
13     AGYSUBT_AB**-UNSPECIFIED No entries in the dataset
47     AGYSUBT_AF2Q-HEADQUARTERS, AIR FORCE WEATHER AGENCY No entries in the dataset
55     AGYSUBT_AF3T-AIR FORCE ELEMENTS, U.S. TRANSPORTATION COMMAND No entries in the dataset
61     AGYSUBT_AF5K-AIR FORCE PETROLEUM AGENCY No entries in the dataset
83     AGYSUBT_AG36-GRAIN INSPECTION, PACKERS AND STOCKYARDS ADMINISTRATION No entries in the dataset
119     AGYSUBT_ARG6-U.S. ARMY NETWORK ENTERPRISE TECHNOLOGY COMMAND/9TH ARMY SIGNAL COMMAND No entries in the dataset
139     AGYSUBT_ARX1-U.S. ARMY MATERIAL COMMAND No entries in the dataset
175     AGYSUBT_CM65-ECONOMICS AND STATISTICS ADMINISTRATION No entries in the dataset
202     AGYSUBT_DD68-DEPARTMENT OF DEFENSE TEST RESOURCE MANAGEMENT CENTER No entries in the dataset
207     AGYSUBT_DD81-DEFENSE ACQUISITION UNIVERSITY No entries in the dataset
249     AGYSUBT_EDEB-OFFICE OF THE DUTY SECRETARY OF EDUCATION No entries in

In [28]:
df_result_uniform.to_csv('uniform_predictions')

In [29]:
# historical predictions 
df_imputed_historical = impute(df, df_t_historical)

In [30]:
df_dummy_historical = dummy_data(df_imputed_historical, df_truth)

In [31]:
df_dummy_historical.Month_DEC.value_counts()

0    1186266
1      72821
Name: Month_DEC, dtype: int64

In [32]:
# Score every dummy column on its own: take the rows where the column equals 1
# and record the subset size together with the count returned by predict_api.
# Rows are collected first and the frame is built once, so text is never written
# into a float column, and only the empty-subset case is reported as
# "No entries" - any other error now surfaces instead of being swallowed.
historical_records = []

for i, col in enumerate(df_dummy_historical.columns):

    pred_data = df_dummy_historical[df_dummy_historical[col]==1]

    if pred_data.shape[0] == 0:
        # nothing to score: keep the category with zero counts
        print(i,'   ',col, "No entries in the dataset")
        historical_records.append((col, 0.0, 0.0))
        continue

    # named n_quits rather than quit so the builtin is not shadowed
    n_quits, sample = predict_api(loaded_model, pred_data, conf)

    historical_records.append((col, float(sample), float(n_quits)))

df_result_historical = pd.DataFrame(historical_records, columns = ['Category', 'Sample_size', 'Predictions_Historical'])

11     AGELVLT_Unspecified No entries in the dataset
13     AGYSUBT_AB**-UNSPECIFIED No entries in the dataset
47     AGYSUBT_AF2Q-HEADQUARTERS, AIR FORCE WEATHER AGENCY No entries in the dataset
55     AGYSUBT_AF3T-AIR FORCE ELEMENTS, U.S. TRANSPORTATION COMMAND No entries in the dataset
61     AGYSUBT_AF5K-AIR FORCE PETROLEUM AGENCY No entries in the dataset
83     AGYSUBT_AG36-GRAIN INSPECTION, PACKERS AND STOCKYARDS ADMINISTRATION No entries in the dataset
119     AGYSUBT_ARG6-U.S. ARMY NETWORK ENTERPRISE TECHNOLOGY COMMAND/9TH ARMY SIGNAL COMMAND No entries in the dataset
139     AGYSUBT_ARX1-U.S. ARMY MATERIAL COMMAND No entries in the dataset
175     AGYSUBT_CM65-ECONOMICS AND STATISTICS ADMINISTRATION No entries in the dataset
202     AGYSUBT_DD68-DEPARTMENT OF DEFENSE TEST RESOURCE MANAGEMENT CENTER No entries in the dataset
207     AGYSUBT_DD81-DEFENSE ACQUISITION UNIVERSITY No entries in the dataset
249     AGYSUBT_EDEB-OFFICE OF THE DUTY SECRETARY OF EDUCATION No entries in

In [33]:
df_result_historical.to_csv('historical_predictions')

In [34]:
df_uni = pd.read_csv('uniform_predictions')
df_hist = pd.read_csv('historical_predictions')

In [35]:
df_uni = df_uni.iloc[:,1:]
df_hist = df_hist.iloc[:,1:]

In [36]:
df_uni.head()

Unnamed: 0,Category,Sample_size,Predictions_Uniform
0,AGELVLT_20-24,24738.0,3088.0
1,AGELVLT_25-29,62577.0,7077.0
2,AGELVLT_30-34,117116.0,10520.0
3,AGELVLT_35-39,158801.0,10259.0
4,AGELVLT_40-44,149072.0,6441.0


In [37]:
df_hist.head()

Unnamed: 0,Category,Sample_size,Predictions_Historical
0,AGELVLT_20-24,24738.0,3134.0
1,AGELVLT_25-29,62577.0,7208.0
2,AGELVLT_30-34,117116.0,10778.0
3,AGELVLT_35-39,158801.0,10254.0
4,AGELVLT_40-44,149072.0,6531.0


In [38]:
df_final = df_uni.join(df_hist, rsuffix = 'historical')

In [39]:
df_final

Unnamed: 0,Category,Sample_size,Predictions_Uniform,Categoryhistorical,Sample_sizehistorical,Predictions_Historical
0,AGELVLT_20-24,24738.0,3088.0,AGELVLT_20-24,24738.0,3134.0
1,AGELVLT_25-29,62577.0,7077.0,AGELVLT_25-29,62577.0,7208.0
2,AGELVLT_30-34,117116.0,10520.0,AGELVLT_30-34,117116.0,10778.0
3,AGELVLT_35-39,158801.0,10259.0,AGELVLT_35-39,158801.0,10254.0
4,AGELVLT_40-44,149072.0,6441.0,AGELVLT_40-44,149072.0,6531.0
5,AGELVLT_45-49,164902.0,3383.0,AGELVLT_45-49,164902.0,3458.0
6,AGELVLT_50-54,193709.0,2060.0,AGELVLT_50-54,193709.0,2070.0
7,AGELVLT_55-59,196591.0,1427.0,AGELVLT_55-59,196591.0,1468.0
8,AGELVLT_60-64,121792.0,1101.0,AGELVLT_60-64,121792.0,1119.0
9,AGELVLT_65 or more,67264.0,1586.0,AGELVLT_65 or more,67264.0,1617.0


In [40]:
df_final.columns

Index(['Category', 'Sample_size', 'Predictions_Uniform', 'Categoryhistorical',
       'Sample_sizehistorical', 'Predictions_Historical'],
      dtype='object')

In [41]:
df_final.sort_values(['Category'], ascending = [True]).head()

Unnamed: 0,Category,Sample_size,Predictions_Uniform,Categoryhistorical,Sample_sizehistorical,Predictions_Historical
0,AGELVLT_20-24,24738.0,3088.0,AGELVLT_20-24,24738.0,3134.0
1,AGELVLT_25-29,62577.0,7077.0,AGELVLT_25-29,62577.0,7208.0
2,AGELVLT_30-34,117116.0,10520.0,AGELVLT_30-34,117116.0,10778.0
3,AGELVLT_35-39,158801.0,10259.0,AGELVLT_35-39,158801.0,10254.0
4,AGELVLT_40-44,149072.0,6441.0,AGELVLT_40-44,149072.0,6531.0


In [42]:
df_final = df_final[['Category', 'Predictions_Historical', 'Predictions_Uniform', 'Sample_size']]

In [43]:
df_final.head()

Unnamed: 0,Category,Predictions_Historical,Predictions_Uniform,Sample_size
0,AGELVLT_20-24,3134.0,3088.0,24738.0
1,AGELVLT_25-29,7208.0,7077.0,62577.0
2,AGELVLT_30-34,10778.0,10520.0,117116.0
3,AGELVLT_35-39,10254.0,10259.0,158801.0
4,AGELVLT_40-44,6531.0,6441.0,149072.0


In [44]:
df_final = df_final[df_final['Sample_size'] > 0]

In [45]:
df_final = df_final.set_index('Category')

In [46]:
# Remove the Year / Month / QTRT dummy rows and the GSEGRD 10-15 rows from the
# result table. The frame is rebound instead of modified in place and labels
# that are already absent are skipped, so re-running this cell (or a label
# removed by an earlier filter) no longer raises KeyError.
df_final = df_final.drop(index=['Year_2017',
                                'Month_APR',
                                'Month_AUG',
                                'Month_DEC',
                                'Month_FEB',
                                'Month_JAN',
                                'Month_JUL',
                                'Month_JUN',
                                'Month_MAR',
                                'Month_MAY',
                                'Month_NOV',
                                'Month_OCT',
                                'Month_SEP',
                                'QTRT_JAN-MAR',
                                'QTRT_APR-JUN',
                                'QTRT_JUL-SEP',
                                'GSEGRD_10',
                                'GSEGRD_11',
                                'GSEGRD_12',
                                'GSEGRD_13',
                                'GSEGRD_14',
                                'GSEGRD_15',
                                'QTRT_OCT-DEC'], errors='ignore')

In [47]:
df_final.sort_values(['Sample_size'], ascending = False, inplace =True)
df_final.shape

(1318, 3)

In [48]:
df_final

Unnamed: 0_level_0,Predictions_Historical,Predictions_Uniform,Sample_size
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LOCTYPT_United States,47248.0,46550.0,1232092.0
PPGROUPT_Standard GSEG Pay Plans,46186.0,45490.0,1226980.0
PAYPLANT_GS-GENERAL SCHEDULE,45903.0,45229.0,1220423.0
WSTYPT_Full-time,37506.0,36942.0,1213694.0
WORKSCHT_F - Full-time Nonseasonal,32353.0,31862.0,1193506.0
TOATYPT_Permanent,44489.0,43977.0,1181056.0
AGYTYPT_Cabinet Level Agencies,42856.0,42345.0,1120488.0
TOAT_10-Competitive Service - Career,17559.0,17559.0,805457.0
PATCOT_Administrative,7564.0,7202.0,525416.0
PATCOT_Professional,17532.0,17660.0,323387.0


In [49]:
df_final.to_csv('publish_version_4_30_20.csv')

In [50]:
sum(df_final.Predictions_Historical) / sum(df_final.Sample_size) 

0.03796804655020438