In [71]:
#uncomment if need to install dependencies

#!pip install pandas
#!pip install plotly
#!pip install plotly.express
#!pip install dash

In [107]:
# load dependencies

import pandas as pd
import numpy as np
#import plotly.express as px
import plotly.graph_objects as go
import plotly.io as io
#io.renderers.default = 'browser' #to ensure prints as expect
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px


In [73]:
# read file
filepath = '../data/raw/HRDataset_v14.xls.txt'
hr_df = pd.read_csv(filepath, sep='\t',  encoding='utf-16')

In [74]:
# check data
hr_df.columns

# continuous variable EDA
print(hr_df.dtypes) #data type
hr_df.describe() #continuous descriptives
len(hr_df.Employee_Name.unique()) #number of unique ids
hr_df.info



Employee_Name                  object
EmpID                           int64
MarriedID                       int64
MaritalStatusID                 int64
GenderID                        int64
EmpStatusID                     int64
DeptID                          int64
PerfScoreID                     int64
FromDiversityJobFairID          int64
Salary                          int64
Termd                           int64
PositionID                      int64
Position                       object
State                          object
Zip                             int64
DOB                            object
Sex                            object
MaritalDesc                    object
CitizenDesc                    object
HispanicLatino                 object
RaceDesc                       object
DateofHire                     object
DateofTermination              object
TermReason                     object
EmploymentStatus               object
Department                     object
ManagerName 

<bound method DataFrame.info of                 Employee_Name  EmpID  MarriedID  MaritalStatusID  GenderID  \
0         Adinolfi, Wilson  K  10026          0                0         1   
1    Ait Sidi, Karthikeyan     10084          1                1         1   
2           Akinkuolie, Sarah  10196          1                1         0   
3                Alagbe,Trina  10088          1                1         0   
4            Anderson, Carol   10069          0                2         0   
..                        ...    ...        ...              ...       ...   
306            Woodson, Jason  10135          0                0         1   
307        Ybarra, Catherine   10301          0                0         0   
308          Zamora, Jennifer  10010          0                0         0   
309               Zhou, Julia  10043          0                0         0   
310             Zima, Colleen  10271          0                4         0   

     EmpStatusID  DeptID  PerfS

In [75]:
# convert data to correct pandas type. 

# convert EmpID, ManagerID, deptid to string.
    # married, martial status, gender, emp status, fromdiv jobfair, recruitmentsource to unordered factor.
to_string = ['EmpID', 'MarriedID', 'DeptID', 'MarriedID', 'MaritalStatusID', 'GenderID', 'FromDiversityJobFairID', 'PositionID', 'ManagerID']
for var in to_string:
    hr_df[var] = hr_df[var].astype(str)

# performance score, satisfaction as ordered factor.
to_cat_ord = ['PerfScoreID', 'EmpSatisfaction']
for var in to_cat_ord:
    hr_df[var] = pd.Categorical(hr_df[var].astype(str), categories = ['1','2','3','4','5'], ordered = True)

# lastperformReview_date, DateofHire, DateofTermination as date object. 
to_date = ['LastPerformanceReview_Date', 'DateofHire', 'DateofTermination']
for var in to_date:
    hr_df[var] = pd.to_datetime(hr_df[var], format = '%m/%d/%Y')

# note: dayslate and abscences are counts.


In [101]:
# filter data for typical data cleaning
hr_df
#hr_df[hr_df['Position'] != 'President & CEO'] # CIO

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,...,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences
0,"Adinolfi, Wilson K",10026,0,0,1,1,5,4,0,62506,...,Michael Albert,22.0,LinkedIn,Exceeds,4.60,5,0,2019-01-17,0,1
1,"Ait Sidi, Karthikeyan",10084,1,1,1,5,3,3,0,104437,...,Simon Roup,4.0,Indeed,Fully Meets,4.96,3,6,2016-02-24,0,17
2,"Akinkuolie, Sarah",10196,1,1,0,5,5,3,0,64955,...,Kissy Sullivan,20.0,LinkedIn,Fully Meets,3.02,3,0,2012-05-15,0,3
3,"Alagbe,Trina",10088,1,1,0,1,5,3,0,64991,...,Elijiah Gray,16.0,Indeed,Fully Meets,4.84,5,0,2019-01-03,0,15
4,"Anderson, Carol",10069,0,2,0,5,5,3,0,50825,...,Webster Butler,39.0,Google Search,Fully Meets,5.00,4,0,2016-02-01,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,"Woodson, Jason",10135,0,0,1,1,5,3,0,65893,...,Kissy Sullivan,20.0,LinkedIn,Fully Meets,4.07,4,0,2019-02-28,0,13
307,"Ybarra, Catherine",10301,0,0,0,5,5,1,0,48513,...,Brannon Miller,12.0,Google Search,PIP,3.20,2,0,2015-09-02,5,4
308,"Zamora, Jennifer",10010,0,0,0,1,3,4,0,220450,...,Janet King,2.0,Employee Referral,Exceeds,4.60,5,6,2019-02-21,0,16
309,"Zhou, Julia",10043,0,0,0,1,3,3,0,89292,...,Simon Roup,4.0,Employee Referral,Fully Meets,5.00,3,5,2019-02-01,0,11


In [104]:
# Calculate BANs

def aggregate_var(df, column_name, var_type = 'continuous'):
    """
    Function that calculates the aggregation of a continuous or categorical variable for the BANs
    """

    res_list = []

    if var_type == 'continuous':
        res_list = df[column_name].mean(),df[column_name].median()
    elif var_type == 'categorical':
        length_no_miss = df[column_name].notna().sum()
        res_list = np.round((df[column_name].isin(['4', '5']).sum() / length_no_miss) * 100, 2), np.round((df[column_name].isin(['1','2','3']).sum() / length_no_miss) * 100, 2)
    elif var_type == 'count':
        res_list = df[column_name].notna().sum()

    return(res_list)

#count
EmpIDRes = len(hr_df.EmpID.unique())
#cont
EngagementSurveyRes = aggregate_var(df = hr_df, column_name = 'EngagementSurvey', var_type = 'continuous')
SalaryRes = aggregate_var(df = hr_df, column_name = 'Salary', var_type = 'continuous')
#cat
EmpSatisfactionRes = aggregate_var(df = hr_df, column_name = 'EmpSatisfaction', var_type = 'categorical')
PerfScoreIDRes = aggregate_var(df = hr_df, column_name = 'PerfScoreID', var_type = 'categorical')




In [99]:
# get unique values for department
dept_select = hr_df.Department.unique()
position_select = hr_df.Position.unique() #consider not providing executive positions here.
position_select

array(['Production Technician I', 'Sr. DBA', 'Production Technician II',
       'Software Engineer', 'IT Support', 'Data Analyst',
       'Database Administrator', 'Enterprise Architect', 'Sr. Accountant',
       'Production Manager', 'Accountant I', 'Area Sales Manager',
       'Software Engineering Manager', 'BI Director',
       'Director of Operations', 'Sr. Network Engineer', 'Sales Manager',
       'BI Developer', 'IT Manager - Support', 'Network Engineer',
       'IT Director', 'Director of Sales', 'Administrative Assistant',
       'President & CEO', 'Senior BI Developer',
       'Shared Services Manager', 'IT Manager - Infra',
       'Principal Data Architect', 'Data Architect', 'IT Manager - DB',
       'Data Analyst ', 'CIO'], dtype=object)

## Data Visualization


In [128]:
# data wrangle for questionnaires
keep_variables = ['Department', 'EmpSatisfaction', '']
hr_df.melt(id_vars= ['Department'], var_name = 'EmpSatisfaction', value_name = 'resp_count')

Unnamed: 0,Department,EmpSatisfaction,resp_count
0,Production,Employee_Name,"Adinolfi, Wilson K"
1,IT/IS,Employee_Name,"Ait Sidi, Karthikeyan"
2,Production,Employee_Name,"Akinkuolie, Sarah"
3,Production,Employee_Name,"Alagbe,Trina"
4,Production,Employee_Name,"Anderson, Carol"
...,...,...,...
10880,Production,Absences,13
10881,Production,Absences,4
10882,IT/IS,Absences,16
10883,IT/IS,Absences,11


In [None]:
# Create the stacked bar chart
hr_df.groupby(['Department', 'EmpSatisfaction'])
hr_df.melt(id_vars='Question', var_name='Response', value_name='Count')

fig = px.bar(hr_df, x='Question', y='Count', color='Response', title='5-Point Likert Scale Responses')

# Update the layout to make it a stacked bar chart
fig.update_layout(barmode='stack', xaxis_title='Questions', yaxis_title='Count', legend_title='Responses')

# Show the figure
fig.show()

In [129]:
# data visualization
px.histogram(hr_df, x = 'EngagementSurvey', y = 'Department', histfunc='avg')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

### Exploratory Data Analysis

In [None]:
hr_df

### Planning

- BANs: satisfaction, performance, engagement, pay

     - Categorical Group vars: department, employment status, position, voluntarily left 

     - continuous group/filter vars: DateofHire

#### Potential Dashboard Design

Some great dashboard examples: 
- https://public.tableau.com/app/profile/pradeepkumar.g/viz/HRAttritionDashboardRWFD_16570446563570/viz 
- https://public.tableau.com/app/profile/gandes.goldestan/viz/HRDashboard_16284874251120/Overview
- https://public.tableau.com/app/profile/frankie.benson/viz/HRDashboard-RWFD_16791683724400/HRExecutiveSummary 

#### Potential Research Questions

- RQ 1: how are different departments doing regarding employee performance, engagement, satisfaction, Special Projects, Days Late (controlling for pay)?
     - among employees currently in company
- RQ 2: who are managers with above and beyond scores controlling for department, employee pay.
- RQ3: how does employee performance vary based on Recruitment source? (controll for department, pay)
- RQ4: if data longitudinal, what are the trajectories of performance over last 4 quarters?


In [116]:
# Sample data
data = {
    'Question': ['Q1', 'Q2', 'Q3', 'Q4', 'Q5'],
    'Strongly Disagree': [10, 15, 5, 20, 10],
    'Disagree': [20, 25, 15, 10, 20],
    'Neutral': [30, 20, 25, 30, 25],
    'Agree': [25, 30, 35, 25, 30],
    'Strongly Agree': [15, 10, 20, 15, 15]
}

df = pd.DataFrame(data)
df



Unnamed: 0,Question,Strongly Disagree,Disagree,Neutral,Agree,Strongly Agree
0,Q1,10,20,30,25,15
1,Q2,15,25,20,30,10
2,Q3,5,15,25,35,20
3,Q4,20,10,30,25,15
4,Q5,10,20,25,30,15


In [122]:
# Melt the DataFrame to long format
df_melted = df.melt(id_vars='Question', var_name='Response', value_name='Count')
df_melted

Unnamed: 0,Question,Response,Count
0,Q1,Strongly Disagree,10
1,Q2,Strongly Disagree,15
2,Q3,Strongly Disagree,5
3,Q4,Strongly Disagree,20
4,Q5,Strongly Disagree,10
5,Q1,Disagree,20
6,Q2,Disagree,25
7,Q3,Disagree,15
8,Q4,Disagree,10
9,Q5,Disagree,20


In [126]:


# Create the stacked bar chart
fig = px.bar(df_melted, x='Question', y='Count', color='Response', title='5-Point Likert Scale Responses')
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed