In [1]:
import glob
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Glob pattern for the employment-cube fact files, and glob for the directory of reference tables

empCube = 'C:/Users/604070/Desktop/Files/Performance/OPM_HR_Analytics/Model_Development/Data/EmpCube/FACTDATA_MAR201?.TXT'
refData = 'C:/Users/604070/Desktop/Files/Performance/OPM_HR_Analytics/Model_Development/Data/REF_Tables/*'

# Accessions and separations source files

accData = 'C:/Users/604070/Desktop/Files/Performance/OPM_HR_Analytics/Model_Development/Data/ACC/ACCDATA_FY2011-2017.TXT'
sepData = 'C:/Users/604070/Desktop/Files/Performance/OPM_HR_Analytics/Model_Development/Data/SEP/SEPDATA_FY2011-2017.TXT'


def load_emp_fact(pattern=None):

    '''Load and clean the employment-cube fact files.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the files to read. Defaults to the notebook-level
        ``empCube`` pattern.

    Returns
    -------
    DataFrame
        All matching files stacked together, rows containing any missing
        value removed, SALARY converted to int, and the DATECODE, EDLVL,
        EMPLOYMENT, STEMOCC, SUPERVIS and WORKSTAT columns dropped.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    '''

    if pattern is None:
        pattern = empCube

    # Code columns are read as strings so values such as '0301' keep their leading zeros.
    column_types = {'AGYSUB': str, 'LOC': str, 'AGELVL': str, 'EDLVL': str, 'GSEGRD': str,
                    'LOSLVL': str, 'OCC': str, 'PATCO': str, 'PPGRD': str, 'SALLVL': str,
                    'STEMOCC': str, 'SUPERVIS': str, 'TOA': str, 'WORKSCH': str,
                    'WORKSTAT': str, 'DATECODE': str, 'EMPLOYMENT': str, 'SALARY': str,
                    'LOS': float}

    # Read everything first and concatenate once; growing a frame inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(name, dtype=column_types) for name in glob.glob(pattern)]

    if not frames:
        raise FileNotFoundError('No employment cube files match: {}'.format(pattern))

    df_total = pd.concat(frames)

    # regex=False: '$' has to be removed as a literal character. Left to the
    # default, its treatment depends on the installed pandas version.
    salary_text = df_total['SALARY'].str.replace('$', '', regex=False)
    df_total['SALARY'] = salary_text.str.replace(',', '', regex=False)

    df_total = df_total.dropna(axis=0, how='any')

    # Cast only after the NaN rows are gone, otherwise astype(int) would fail.
    df_total = df_total.assign(SALARY=df_total['SALARY'].astype(int))

    return df_total.drop(['DATECODE', 'EDLVL', 'EMPLOYMENT', 'STEMOCC', 'SUPERVIS', 'WORKSTAT'], axis=1)


def load_acc_fact(path=None):

    '''Load accessions data.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the notebook-level ``accData`` path.

    Returns
    -------
    DataFrame
        The file contents with every row containing a missing value removed
        and the SALARY column converted to int.
    '''

    print('Loading data...')

    if path is None:
        path = accData

    complete_rows = pd.read_csv(path, low_memory=False).dropna(axis=0, how='any')

    # assign() builds a new frame instead of writing into the dropna() result.
    return complete_rows.assign(SALARY=complete_rows['SALARY'].astype(int))

def load_sep_fact(path=None):

    '''Load separations data.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the notebook-level ``sepData`` path.

    Returns
    -------
    DataFrame
        The file contents with every row containing a missing value removed,
        SALARY converted to int, and rows whose OCC is '****' or whose TOA is
        '**' filtered out.
    '''

    if path is None:
        path = sepData

    complete_rows = pd.read_csv(path, low_memory=False).dropna(axis=0, how='any')

    # assign() builds a new frame instead of writing into the dropna() result.
    complete_rows = complete_rows.assign(SALARY=complete_rows['SALARY'].astype(int))

    # '****' / '**' look like placeholder codes in the source file - TODO confirm meaning.
    has_real_codes = (complete_rows['OCC'] != '****') & (complete_rows['TOA'] != '**')

    return complete_rows[has_real_codes]




def load_dim_table(pattern=None):

    '''Load the reference tables.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the reference files. Defaults to the notebook-level
        ``refData`` pattern.

    Returns
    -------
    list of DataFrame
        One frame per matching file, ordered by file path (case-insensitive).
    '''

    if pattern is None:
        pattern = refData

    # glob.glob() makes no promise about ordering, but joins() picks tables by
    # position, so the listing is sorted to make those positions stable.
    # NOTE(review): the case-insensitive key is meant to mirror an alphabetical
    # directory listing - confirm the indices used in joins() still line up.
    ordered_paths = sorted(glob.glob(pattern), key=str.upper)

    return [pd.read_csv(name) for name in ordered_paths]



def join_func(df, df_other):

    '''Merge ``df_other`` onto ``df`` with pandas' default merge settings and return the result.'''

    return df.merge(right=df_other)


def joins(df, data):

    '''Merge the reference tables onto an accessions or separations fact table.

    Tables are picked from load_dim_table() by position, so the result depends
    on the number and order of reference files found in the directory.

    df   : fact table to enrich.
    data : 'accessions' merges the accessions reference table first; any
           other value merges the separations reference table first.

    Returns the fact table after all merges.
    '''

    dim = load_dim_table()

    # Event-specific tables: position 0 is accessions, position 13 is separations.
    acc_table, sep_table = dim[0], dim[13]

    # Tables merged for both kinds of fact data, listed in merge order:
    # age, agency, date, location, los, patco, paygroup, salary, toa, worksch, occ.
    shared_positions = (1, 2, 5, 7, 8, 10, 11, 12, 16, 18, 9)
    shared_tables = [dim[position] for position in shared_positions]

    leading_table = acc_table if data == 'accessions' else sep_table

    df_join = df

    for table in [leading_table] + shared_tables:

        df_join = join_func(df_join, table)

    return df_join



def join_dim_tables():

    '''Reduce the star schema to flat tables.

    Loads the accessions fact table and joins its reference tables, then does
    the same for separations. Returns (accessions_joined, separations_joined).
    '''

    sources = ((load_acc_fact, 'accessions'), (load_sep_fact, 'separations'))

    # Evaluated in order: accessions are loaded and joined before separations.
    return tuple(joins(loader(), kind) for loader, kind in sources)


def raw_data():

    '''Assemble model features and quit labels from the joined ACC/SEP data.

    Returns
    -------
    X : DataFrame
        The selected feature columns for rows whose YEAR is 2015 or later.
    y : Series (name 0)
        't+2', 't+1' or 't' for rows labelled 'Quit' in 2015, 2016 or 2017
        respectively; 'Not_Quitting' for every other row.
    df_binary : DataFrame
        Single column 'ACCT' holding 1 where the row is labelled 'Quit', else 0.
    '''

    acc, sep = join_dim_tables()

    rawData = pd.concat([acc, sep], axis = 0, sort = True)

    # Integer-dividing by 100 drops the last two digits of EFDATE
    # (presumably a YYYYMM code - TODO confirm).
    rawData['YEAR'] = rawData['EFDATE'] // 100

    twoYear_data = rawData.loc[rawData['YEAR'] >= 2015]

    feature_columns = ['AGELVLT', 'AGYSUBT', 'AGYT', 'AGYTYPT', 'LOCT',
                       'LOCTYPT', 'LOS', 'LOSLVLT', 'OCCFAMT', 'PATCOT', 'PAYPLANT', 'PPGROUPT',
                       'PPTYPT', 'QTR', 'SALARY', 'SALLVLT', 'TOAT', 'TOATYPT', 'WORKSCHT', 'WSTYPT']

    X = twoYear_data.loc[:, feature_columns]

    # One event description per row: SEPT wherever it is present, ACCT otherwise.
    # Built with where() instead of an in-place update() on a column pulled out
    # of the frame; that form only writes back when the column is a view and
    # silently does nothing under pandas Copy-on-Write.
    event = twoYear_data['ACCT'].where(twoYear_data['SEPT'].isna(), twoYear_data['SEPT'])

    is_quit = event == 'Quit'

    # The column keeps the name 'ACCT' because callers rename it from that name.
    df_binary = is_quit.astype('int64').to_frame()

    status = is_quit.map(lambda quit: 'QUIT' if quit else 'notQuit')

    tagged = status + twoYear_data['YEAR'].astype(str)

    horizon = {'QUIT2015': 't+2', 'QUIT2016': 't+1', 'QUIT2017': 't'}

    # rename(0) keeps the Series name the previous implementation returned.
    y = tagged.map(lambda tag: horizon.get(tag, 'Not_Quitting')).rename(0)

    return X, y, df_binary
    

def dummies_func(X,y):

    '''One-hot encode the labels and the features with pd.get_dummies.

    Returns (y_encoded, X_encoded) - note the label frame comes first.
    '''

    encoded_labels, encoded_features = (pd.get_dummies(part) for part in (y, X))

    return encoded_labels, encoded_features

    


def correl(df, features, labels=None):

    '''Join features with label data and produce a correlation heatmap.

    Parameters
    ----------
    df : DataFrame
        Frame containing the feature columns.
    features : list
        Column names of ``df`` to include.
    labels : Series or DataFrame, optional
        Label data placed next to the features. When omitted, the
        notebook-level variable ``y`` is used, which must then exist.

    Returns
    -------
    The object returned by seaborn.heatmap.
    '''

    print("Loading correlation matrix...")

    if labels is None:
        # Previous behaviour: read the labels from a global named `y`.
        labels = y

    selected = pd.concat([df[features], labels], axis = 1)

    matrix = pd.get_dummies(selected).corr()

    fig, ax = plt.subplots(figsize=(20,20))

    obj = sns.heatmap(matrix, linewidths=.1, xticklabels=True, yticklabels=True, ax=ax)

    # show() takes no figure/axes argument; a positional value would be read
    # as an unrelated option by the backend.
    plt.show()

    return obj


def model_build(X_encoded, y_encoded):

    '''Fit a decision tree on a 70/30 split and return its test score.

    Parameters
    ----------
    X_encoded : encoded feature matrix.
    y_encoded : encoded labels.

    Returns
    -------
    The value of ``model.score`` on the held-out 30% of the data.
    '''

    print("Loading model...")

    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size = .3, random_state = 305)

    # Seed the tree as well as the split so repeated runs give the same score.
    model = DecisionTreeClassifier(random_state = 305)

    model.fit(X_train, y_train)

    # Train/test predictions and the training score used to be computed here
    # and then discarded; only the test score is ever returned.
    return model.score(X_test, y_test)

def quit_correlation():

    '''Correlate every one-hot encoded feature with the binary quit flag.

    Returns a one-column DataFrame (column 0) indexed by feature name and
    sorted from the highest correlation to the lowest.
    '''

    features, _, quit_flag = raw_data()

    encoded = pd.get_dummies(features)

    quit_flag = quit_flag.rename(columns = {"ACCT": "Quit"})

    def corr_with_quit(name):
        # Two-column frame [Quit, feature]; the off-diagonal cell is their correlation.
        pair = quit_flag.copy()
        pair[name] = encoded[name]
        return pair.corr().iloc[0,1]

    scores = {name: corr_with_quit(name) for name in list(encoded.columns)}

    corr_table = pd.DataFrame.from_dict(data = scores, orient = 'index')

    return corr_table.sort_values(0, ascending = False)


In [2]:
# Build the feature-vs-quit correlation table (this loads and joins the ACC/SEP data first).
df_quit = quit_correlation()

Loading data...


In [3]:
# Preview the top rows; the table is sorted by correlation in descending order.
df_quit.head()

Unnamed: 0,0
LOSLVLT_1 - 2 years,0.169499
LOSLVLT_5 - 9 years,0.105137
LOSLVLT_3 - 4 years,0.102344
WSTYPT_Not Full-time,0.082198
QTR,0.067091


In [4]:
# Save the full correlation table to an Excel file (relative path, so it lands in the working directory).
df_quit.to_excel("quit_dataframe.xlsx")