In [45]:
import numpy as np
import seaborn as sns
import pandas as pd

In [46]:
emp=pd.read_csv("/Downloads/employee_attrition.csv")

In [47]:
emp.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0


checking for any nulls


In [48]:
emp.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

checking for columns with unique values

In [49]:
for column in emp.columns:
    if emp[column].dtype == object:
        print(str(column) + ' : ' + str(emp[column].unique()))
        print(emp[column].value_counts())

Attrition : ['Yes' 'No']
No     1233
Yes     237
Name: Attrition, dtype: int64
BusinessTravel : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64
Department : ['Sales' 'Research & Development' 'Human Resources']
Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64
EducationField : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64
Gender : ['Female' 'Male']
Male      882
Female    588
Name: Gender, dtype: int64
JobRole : ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human 

standard hours, over18 and employeeCount column contain only one unique value and employee number is also of no use so we remove these columns as it will not effect our analysis 

In [50]:
emp = emp.drop(['EmployeeNumber','StandardHours','EmployeeCount','Over18'], axis = 1)


In [51]:
emp.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0


transforming non numeric columns into numeric columns so machine learning models can run easily

In [52]:
from sklearn.preprocessing import LabelEncoder
for column in emp.columns:
        if emp[column].dtype == np.number:
            continue
        emp[column] = LabelEncoder().fit_transform(emp[column])

  if emp[column].dtype == np.number:


In [53]:
emp.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,23,1,2,624,2,0,1,1,1,0,...,0,0,0,8,0,0,6,4,0,5
1,31,0,1,113,1,7,0,1,2,1,...,1,3,1,10,3,2,10,7,1,7
2,19,1,2,805,1,1,1,4,3,1,...,0,1,0,7,3,2,0,0,0,0


we will bring attrition column with first index and remove age from first index

In [54]:
emp['Age_Years'] = emp['Age']
emp= emp.drop('Age', axis = 1)

In [55]:
emp.shape[1]

31

In [56]:
x = emp.iloc[:, 1:emp.shape[1]].values 
y = emp.iloc[:, 0].values 

In [57]:
y

array([1, 0, 1, ..., 0, 0, 0])

In [58]:
x

array([[  2, 624,   2, ...,   0,   5,  23],
       [  1, 113,   1, ...,   1,   7,  31],
       [  2, 805,   1, ...,   0,   0,  19],
       ...,
       [  2,  39,   1, ...,   0,   3,   9],
       [  1, 579,   2, ...,   0,   8,  31],
       [  2, 336,   1, ...,   1,   2,  16]], dtype=int64)

split data into test train

In [59]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

Run the models

In [60]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
y_pred=knn.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_knn=accuracy_score(y_test,y_pred)



In [61]:
accuracy_knn

0.7934782608695652

In [62]:
from sklearn.ensemble import RandomForestClassifier
rm=RandomForestClassifier(n_estimators = 100)
rm.fit(x_train,y_train)
y_pred_rm=rm.predict(x_test)
accuracy_rm=accuracy_score(y_test,y_pred_rm)


In [63]:
accuracy_rm

0.8641304347826086