In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

# We will ask following questions and investigate
1. How does Attrition vary with age?
2. Is there a relationship between gender and attrition?
3. Does salary impact attrition?
4. How does Job Satisfaction relate to Attrition?
5. Does BusinessTravel correlate with attrition?
6. What is the impact of over-time on attrition?
7. How do education levels affect attrition?
8. Is job role a significant factor in attrition?
9. Does distance from home influence attrition?
10. Does work-life balance affect attrition?
11. How does the number of companies worked for impact attrition?
12. What is the relationship between job level and attrition?
13. Is there a correlation between total years of work experience and attrition?
14. Does environment satisfaction influence attrition?
15. What is the correlation between performance rating and attrition?
16. What role does marital status play in attrition?
17. How does overtime affect job satisfaction?

# Fields available in this dataset

1. Age                      <int>
1. Attrition                <cat>
1. BusinessTravel           <cat>
1. DailyRate                <int>
1. Department               <cat>
1. DistanceFromHome         <int>
1. Education                <int>
1. EducationField           <cat>
1. EmployeeCount            <int>
1. EmployeeNumber           <int>
1. EnvironmentSatisfaction  <int>
1. Gender                   <cat>
1. HourlyRate               <int>
1. JobInvolvement           <int>
1. JobLevel                 <int>
1. JobRole                  <cat>
1. JobSatisfaction          <int>
1. MaritalStatus            <cat>
1. MonthlyIncome            <int>
1. MonthlyRate              <int>
1. NumCompaniesWorked       <int>
1. Over18                   <cat>
1. OverTime                 <cat>
1. PercentSalaryHike        <int>
1. PerformanceRating        <int>
1. RelationshipSatisfaction <int>
1. StandardHours            <int>
1. StockOptionLevel         <int>
1. TotalWorkingYears        <int>
1. TrainingTimesLastYear    <int>
1. WorkLifeBalance          <int>
1. YearsAtCompany           <int>
1. YearsInCurrentRole       <int>
1. YearsSinceLastPromotion  <int>
1. YearsWithCurrManager     <int>

In [5]:
url="https://github.com/dasarpai/DAI-Datasets/blob/main/WA_Fn-UseC_-HR-Employee-Attrition.csv"

url = "https://raw.githubusercontent.com/dasarpai/DAI-Datasets/main/WA_Fn-UseC_-HR-Employee-Attrition.csv"

df = pd.read_csv(url)

In [6]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [7]:
# Loop through each column in the DataFrame to get all categorical fields and unique values
for column in df.select_dtypes(include=['object', 'category']).columns:
    print(f"Unique values in '{column}':")
    print(df[column].unique())
    print("-" * 50)  # Separator line for readability


Unique values in 'Attrition':
['Yes' 'No']
--------------------------------------------------
Unique values in 'BusinessTravel':
['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
--------------------------------------------------
Unique values in 'Department':
['Sales' 'Research & Development' 'Human Resources']
--------------------------------------------------
Unique values in 'EducationField':
['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
--------------------------------------------------
Unique values in 'Gender':
['Female' 'Male']
--------------------------------------------------
Unique values in 'JobRole':
['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
--------------------------------------------------
Unique values in 'MaritalStatus':
['Single' 'Married' 'Divorced']
------------------------------

In [8]:
df.isna().sum()

Unnamed: 0,0
Age,0
Attrition,0
BusinessTravel,0
DailyRate,0
Department,0
DistanceFromHome,0
Education,0
EducationField,0
EmployeeCount,0
EmployeeNumber,0


In [9]:
# 1. How does Attrition vary with age?
fig = px.box(df, x='Attrition', y='Age', points="all", title="Attrition by Age")
fig.show()

In [10]:
# 2. Is there a relationship between gender and attrition?
fig = px.histogram(df, x='Gender', color='Attrition', barmode='stack', title="Attrition by Gender")
fig.show()


In [11]:
# 2. Is there a relationship between gender and attrition?
gender_attrition = df.groupby(['Gender', 'Attrition']).size().reset_index(name='count')
gender_attrition['percentage'] = gender_attrition.groupby('Gender')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(gender_attrition, x='Gender', y='percentage', color='Attrition', barmode='stack', title="Attrition by Gender (Percentage)")
fig.show()

In [12]:
# 3. Does salary impact attrition?
fig = px.scatter(df, x='MonthlyIncome', y='Attrition', title="Attrition vs Monthly Income")
fig.show()

In [13]:
# 4. How does Job Satisfaction relate to Attrition?
fig = px.box(df, x='Attrition', y='JobSatisfaction', title="Job Satisfaction by Attrition")
fig.show()


In [14]:
# 5. Does working remotely (BusinessTravel) correlate with attrition?
fig = px.histogram(df, x='BusinessTravel', color='Attrition', barmode='group', title="Attrition by Business Travel")
fig.show()

In [15]:
# 5. Does working remotely (BusinessTravel) correlate with attrition?
business_travel_attrition = df.groupby(['BusinessTravel', 'Attrition']).size().reset_index(name='count')
business_travel_attrition['percentage'] = business_travel_attrition.groupby('BusinessTravel')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(business_travel_attrition, x='BusinessTravel', y='percentage', color='Attrition', barmode='group', title="Attrition by Business Travel (Percentage)")
fig.show()

In [16]:
# 6. What is the impact of over-time on attrition?
fig = px.pie(df, names='OverTime', color='Attrition', title="Attrition by OverTime")
fig.show()

In [17]:
# Calculate percentage of attrition within each OverTime category
overtime_attrition = df.groupby(['OverTime', 'Attrition']).size().reset_index(name='count')
overtime_attrition['percentage'] = overtime_attrition.groupby('OverTime')['count'].transform(lambda x: x / x.sum() * 100)

# Plot as a stacked bar chart
fig = px.bar(overtime_attrition, x='OverTime', y='percentage', color='Attrition',
             title="Attrition by OverTime (Percentage)", barmode='stack')
fig.show()


In [18]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [19]:
# 7. How do education levels affect attrition?
df['Education'] = df['Education'].astype(str)  # or `astype('category')

fig = px.bar(df, x='Education', y=df.index, color='Attrition',  barmode='group', title="Attrition by Education Level")
fig.show()

In [20]:
# 7. How do education levels affect attrition?
education_attrition = df.groupby(['Education', 'Attrition']).size().reset_index(name='count')
education_attrition['percentage'] = education_attrition.groupby('Education')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(education_attrition, x='Education', y='percentage', color='Attrition', barmode='group', title="Attrition by Education Level (Percentage)")
fig.show()

In [21]:
# 8. Is job role a significant factor in attrition?
fig = px.histogram(df, x='JobRole', color='Attrition', barmode='stack', title="Attrition by Job Role")
fig.show()

In [22]:
# 8. Is job role a significant factor in attrition?
job_role_attrition = df.groupby(['JobRole', 'Attrition']).size().reset_index(name='count')
job_role_attrition['percentage'] = job_role_attrition.groupby('JobRole')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(job_role_attrition, x='JobRole', y='percentage', color='Attrition', barmode='stack', title="Attrition by Job Role (Percentage)")
fig.show()

In [23]:
# 9. Does distance from home influence attrition?
fig = px.scatter(df, x='DistanceFromHome', y='Attrition', title="Attrition vs Distance from Home")
fig.show()

In [24]:
# 10. Does work-life balance affect attrition?
fig = px.box(df, x='Attrition', y='WorkLifeBalance', title="Work-Life Balance by Attrition")
fig.show()

In [25]:
# 11. How does the number of companies worked for impact attrition?
fig = px.histogram(df, x='NumCompaniesWorked', color='Attrition', barmode='group', title="Attrition by Number of Companies Worked")
fig.show()

In [26]:
# 11. How does the number of companies worked for impact attrition?
num_companies_attrition = df.groupby(['NumCompaniesWorked', 'Attrition']).size().reset_index(name='count')
num_companies_attrition['percentage'] = num_companies_attrition.groupby('NumCompaniesWorked')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(num_companies_attrition, x='NumCompaniesWorked', y='percentage', color='Attrition', barmode='group', title="Attrition by Number of Companies Worked (Percentage)")
fig.show()

In [27]:
# 12. What is the relationship between job level and attrition?

# Convert JobLevel to a categorical type if it's not already
df['JobLevel'] = df['JobLevel'].astype(str)  # or `astype('category')`

# Create the bar chart
fig = px.histogram(df, x='JobLevel', color='Attrition', barmode='group',
                   title="Attrition by Job Level",
                   category_orders={"JobLevel": sorted(df['JobLevel'].unique())})  # Ordering levels

fig.show()

In [28]:
# 12. What is the relationship between job level and attrition?
job_level_attrition = df.groupby(['JobLevel', 'Attrition']).size().reset_index(name='count')
job_level_attrition['percentage'] = job_level_attrition.groupby('JobLevel')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(job_level_attrition, x='JobLevel', y='percentage', color='Attrition', barmode='group', title="Attrition by Job Level (Percentage)")
fig.show()

In [29]:
# 13. Is there a correlation between total years of work experience and attrition?
fig = px.scatter(df, x='TotalWorkingYears', y='Attrition', title="Attrition vs Total Working Years")
fig.show()

In [30]:
# 14. Does environment satisfaction influence attrition?
fig = px.box(df, x='Attrition', y='EnvironmentSatisfaction', title="Environment Satisfaction by Attrition")
fig.show()


In [31]:
# 16. What role does marital status play in attrition?
fig = px.histogram(df, x='MaritalStatus', color='Attrition', barmode='stack', title="Attrition by Marital Status")
fig.show()

In [32]:
# 16. What role does marital status play in attrition?
marital_status_attrition = df.groupby(['MaritalStatus', 'Attrition']).size().reset_index(name='count')
marital_status_attrition['percentage'] = marital_status_attrition.groupby('MaritalStatus')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(marital_status_attrition, x='MaritalStatus', y='percentage', color='Attrition', barmode='stack', title="Attrition by Marital Status (Percentage)")
fig.show()

In [33]:
# 17. How does overtime affect job satisfaction?
fig = px.box(df, x='OverTime', y='JobSatisfaction', color='Attrition', title="Job Satisfaction by OverTime and Attrition")
fig.show()