In [19]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score
from sklearn.preprocessing import StandardScaler

In [20]:
df=pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [21]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [22]:
df.shape

(1470, 35)

In [23]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [24]:
df.select_dtypes(include='number').skew()

Age                         0.413286
DailyRate                  -0.003519
DistanceFromHome            0.958118
Education                  -0.289681
EmployeeCount               0.000000
EmployeeNumber              0.016574
EnvironmentSatisfaction    -0.321654
HourlyRate                 -0.032311
JobInvolvement             -0.498419
JobLevel                    1.025401
JobSatisfaction            -0.329672
MonthlyIncome               1.369817
MonthlyRate                 0.018578
NumCompaniesWorked          1.026471
PercentSalaryHike           0.821128
PerformanceRating           1.921883
RelationshipSatisfaction   -0.302828
StandardHours               0.000000
StockOptionLevel            0.968980
TotalWorkingYears           1.117172
TrainingTimesLastYear       0.553124
WorkLifeBalance            -0.552480
YearsAtCompany              1.764529
YearsInCurrentRole          0.917363
YearsSinceLastPromotion     1.984290
YearsWithCurrManager        0.833451
dtype: float64

In [25]:
from sklearn.preprocessing import power_transform

numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = power_transform(df[numeric_cols])

In [26]:
df[numeric_cols].skew()

Age                        -0.007603
DailyRate                  -0.196160
DistanceFromHome           -0.007468
Education                  -0.099881
EmployeeCount               0.000000
EmployeeNumber             -0.288025
EnvironmentSatisfaction    -0.205395
HourlyRate                 -0.106461
JobInvolvement             -0.020365
JobLevel                    0.103624
JobSatisfaction            -0.212815
MonthlyIncome               0.032831
MonthlyRate                -0.184087
NumCompaniesWorked          0.014814
PercentSalaryHike           0.116250
PerformanceRating           0.000000
RelationshipSatisfaction   -0.193634
StandardHours               0.000000
StockOptionLevel            0.087019
TotalWorkingYears          -0.010653
TrainingTimesLastYear       0.058058
WorkLifeBalance            -0.012286
YearsAtCompany             -0.008683
YearsInCurrentRole         -0.060849
YearsSinceLastPromotion     0.212129
YearsWithCurrManager       -0.067564
dtype: float64

In [27]:
df=df.drop(['EmployeeCount','EmployeeNumber','MonthlyRate','HourlyRate','DailyRate','Over18','StandardHours'],axis=1)
df.shape

(1470, 28)

In [28]:
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

In [29]:
x = df.select_dtypes(include='number').drop(columns=['Attrition'])
y=df['Attrition']

In [30]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [31]:
Scalar=StandardScaler()
x_train_scaled=Scalar.fit_transform(x_train)
x_test_scaled=Scalar.transform(x_test)

In [32]:
model=XGBClassifier()
model.fit(x_train_scaled,y_train)
pred_y=model.predict(x_test_scaled)
print(accuracy_score(y_test,pred_y))
print(classification_report(y_test,pred_y))

0.8503401360544217
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       255
           1       0.39      0.23      0.29        39

    accuracy                           0.85       294
   macro avg       0.64      0.59      0.60       294
weighted avg       0.82      0.85      0.83       294



In [33]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) #stratifiesdCV due to uneven distribution of data
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 6],
    'learning_rate': [0.1, 0.2,0.3],
    'subsample': [0.7, 0.8,0.9],
    'colsample_bytree': [0.7, 0.8,0.9]
}
grid = GridSearchCV(model, param_grid, scoring='f1', cv=cv)
grid.fit(x_train_scaled, y_train)
print("Best params:", grid.best_params_)
print("Best CV F1 score:", grid.best_score_)
best_model = grid.best_estimator_
pred_test = best_model.predict(x_test_scaled)

Best params: {'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
Best CV F1 score: 0.3997949735449735


In [34]:
pred_train = best_model.predict(x_train_scaled)
print(accuracy_score(y_test,pred_test))
print(classification_report(y_test,pred_test))

0.8503401360544217
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       255
           1       0.41      0.28      0.33        39

    accuracy                           0.85       294
   macro avg       0.65      0.61      0.62       294
weighted avg       0.83      0.85      0.84       294



In [35]:
model2=RandomForestClassifier(random_state=42)
model2.fit(x_train,y_train)
pred_y2=model2.predict(x_test)
print(accuracy_score(y_test,pred_y2))
print(classification_report(y_test,pred_y2))

0.8673469387755102
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       255
           1       0.50      0.08      0.13        39

    accuracy                           0.87       294
   macro avg       0.69      0.53      0.53       294
weighted avg       0.83      0.87      0.82       294



In [36]:
import pickle as pkl

with open('attrition.pkl','wb') as f:
    pkl.dump((Scalar,model,model2),f)