In [6]:
# Importing Libraries
import datetime
from statistics import stdev
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, auc, classification_report,
                             confusion_matrix, precision_recall_curve,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [7]:
# Load the dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv("HR-Employee-Attrition.csv")

In [8]:
# Data Analysis
def analyze(df):
    """Summarise every column of ``df`` in a single table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``'data type'``,
        ``'missing values'``, ``'No. unique'`` and ``'unique values'``,
        sorted by ``'data type'``.
    """
    # Compute each column's distinct values once and reuse them for both the
    # count and the listing.  Iterating explicitly instead of using
    # ``df.apply(lambda x: x.unique())`` matters: when every column happens to
    # have the same number of distinct values (always true for a one-column
    # frame), ``apply`` stacks the arrays into a DataFrame rather than
    # returning one array per column, which breaks the 'unique values' column.
    uniques = [column.unique() for _, column in df.items()]

    info = pd.DataFrame()
    info['data type'] = df.dtypes
    info['missing values'] = df.isnull().sum()
    # NaN is counted as a distinct value, exactly as len(x.unique()) does.
    info['No. unique'] = pd.Series(
        [len(values) for values in uniques], index=df.columns, dtype='int64'
    )
    # dtype=object keeps each cell as one whole array of distinct values.
    info['unique values'] = pd.Series(uniques, index=df.columns, dtype=object)
    return info.sort_values('data type')
    
# Last expression of the cell, so the summary table is rendered as its output.
analyze(df)

Unnamed: 0,data type,missing values,No. unique,unique values
Age,int64,0,43,"[41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2..."
YearsInCurrentRole,int64,0,19,"[4, 7, 0, 2, 5, 9, 8, 3, 6, 13, 1, 15, 14, 16,..."
YearsAtCompany,int64,0,37,"[6, 10, 0, 8, 2, 7, 1, 9, 5, 4, 25, 3, 12, 14,..."
WorkLifeBalance,int64,0,4,"[1, 3, 2, 4]"
TrainingTimesLastYear,int64,0,7,"[0, 3, 2, 5, 1, 4, 6]"
TotalWorkingYears,int64,0,40,"[8, 10, 7, 6, 12, 1, 17, 5, 3, 31, 13, 0, 26, ..."
StockOptionLevel,int64,0,4,"[0, 1, 3, 2]"
StandardHours,int64,0,1,[80]
RelationshipSatisfaction,int64,0,4,"[1, 4, 2, 3]"
PerformanceRating,int64,0,2,"[3, 4]"


In [9]:
# Drop unnecessary columns (StandardHours, for one, holds a single value).
# errors='ignore' keeps the cell re-runnable: a second execution, when the
# columns are already gone, is a no-op instead of raising KeyError.
df = df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'],
    errors='ignore',
)

In [20]:
# Share of each attrition class, expressed in percent
attrition_distribution = df['Attrition'].value_counts(normalize=True).mul(100)

# Build the pie and style it in one chain (update_layout returns the figure).
fig = px.pie(
    names=attrition_distribution.index,
    values=attrition_distribution,
    labels=dict(names='Attrition Status', values='Percentage'),
    title='<b>Attrition Status Distribution</b>',
    color_discrete_sequence=['#A6AEFF', '#09B39C'],
    height=400,
).update_layout(
    title_x=0.5,
    # Horizontal legend, centred underneath the chart
    legend={
        'orientation': 'h',
        'xanchor': 'center',
        'x': 0.5,
        'yanchor': 'bottom',
        'y': -0.2,
    },
    font_color='#4A4A4A',
    paper_bgcolor='#DADCE0',
    plot_bgcolor='#DADCE0',
)

fig.show()


In [24]:
# Keep only the employees whose Attrition is 'Yes', then count them per gender
attrition_yes = df.loc[df['Attrition'].eq('Yes')]
attrition_gender_counts = attrition_yes['Gender'].value_counts()

# Build the pie and style it in one chain (update_layout returns the figure).
fig = px.pie(
    names=attrition_gender_counts.index,
    values=attrition_gender_counts,
    labels=dict(names='Gender', values='Count'),
    title='<b>Attrition by Gender</b>',
    color_discrete_sequence=['#A6AEFF', '#09B39C'],
    height=400,
).update_layout(
    title_x=0.5,
    # Horizontal legend, centred underneath the chart
    legend={
        'orientation': 'h',
        'xanchor': 'center',
        'x': 0.5,
        'yanchor': 'bottom',
        'y': -0.2,
    },
    font_color='#4A4A4A',
    paper_bgcolor='#DADCE0',
    plot_bgcolor='#DADCE0',
)

fig.show()