In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

Creating dataframe from dataset

In [2]:
df = pd.read_csv('hr_employee_churn_data.csv')

Check the correcly upload

In [3]:
df.head(5)

Unnamed: 0,empid,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,1,0.38,0.53,2,157,3,0,0,low,1
1,2,0.8,0.86,5,262,6,0,0,medium,1
2,3,0.11,0.88,7,272,4,0,0,medium,1
3,4,0.72,0.87,5,223,5,0,0,low,1
4,5,0.37,0.52,2,159,3,0,0,low,1


Check rows vs columns

In [4]:
df.shape

(14999, 10)

Checking structure

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   empid                  14999 non-null  int64  
 1   satisfaction_level     14997 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB


#Feature Engineering

In [6]:
df1 = df.copy()

In [7]:
df1.drop(['empid'],axis=1, inplace=True)

In [8]:
df1.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


Missing values

In [9]:
df1.isnull().sum()

satisfaction_level       2
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

In [10]:
df1['satisfaction_level'].describe()

count    14997.000000
mean         0.612863
std          0.248634
min          0.090000
25%          0.440000
50%          0.640000
75%          0.820000
max          1.000000
Name: satisfaction_level, dtype: float64

In [11]:
df1['satisfaction_level'].fillna(df1['satisfaction_level'].mean(),inplace=True)

In [12]:
df1.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

In [13]:
df1['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [16]:
salary_dumm = pd.get_dummies(df1['salary'])

In [17]:
salary_dumm

Unnamed: 0,high,low,medium
0,False,True,False
1,False,False,True
2,False,False,True
3,False,True,False
4,False,True,False
...,...,...,...
14994,False,True,False
14995,False,True,False
14996,False,True,False
14997,False,True,False


In [18]:
df1 = pd.concat([df1,salary_dumm],axis=1)

In [19]:
df1.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left,high,low,medium
0,0.38,0.53,2,157,3,0,0,low,1,False,True,False
1,0.8,0.86,5,262,6,0,0,medium,1,False,False,True
2,0.11,0.88,7,272,4,0,0,medium,1,False,False,True
3,0.72,0.87,5,223,5,0,0,low,1,False,True,False
4,0.37,0.52,2,159,3,0,0,low,1,False,True,False


In [20]:
df1.drop(['salary'], axis=1, inplace=True)

In [21]:
df1.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,high,low,medium
0,0.38,0.53,2,157,3,0,0,1,False,True,False
1,0.8,0.86,5,262,6,0,0,1,False,False,True
2,0.11,0.88,7,272,4,0,0,1,False,False,True
3,0.72,0.87,5,223,5,0,0,1,False,True,False
4,0.37,0.52,2,159,3,0,0,1,False,True,False


In [31]:
df1.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
left                     0
high                     0
low                      0
medium                   0
dtype: int64

In [32]:
df2 = df1.copy()

In [33]:
df2.drop(['high'], axis=1, inplace=True)

In [34]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,low,medium
0,0.38,0.53,2,157,3,0,0,1,True,False
1,0.8,0.86,5,262,6,0,0,1,False,True
2,0.11,0.88,7,272,4,0,0,1,False,True
3,0.72,0.87,5,223,5,0,0,1,True,False
4,0.37,0.52,2,159,3,0,0,1,True,False


# Split Dataset into Training set and Test set

In [35]:
x = df2.drop(labels='left', axis=1)
y = df2['left']

splitting into train and test

In [23]:
from sklearn.model_selection import train_test_split

In [36]:
x_train, x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=0)

In [37]:
len(x_train)

11999

In [38]:
len(x_test)

3000

# Model Selection

In [39]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [44]:
model_param = {
    'RandomForestClassifier':{
        'model':RandomForestClassifier(),
        'param':{
            'n_estimators':[10,50,100],
            'criterion':['gini','entropy','entropy'],
            'max_depth': range(2,4,1),
            'max_features':['auto','log2','log2']
        }
        
    },
    'XGBClassifier':{
        'model':XGBClassifier(objective='binary:logistic'),
        'param':{
            'learning_rate':[0.5, 0.1, 0.01, 0.001],
            'max_depth':[3, 5, 10, 20],
            'n_estimators':[10, 50, 100, 200]
        }
    }
}

In [45]:
scores = []

for model_name, mp in model_param.items():
    model_selection = GridSearchCV(estimator=mp['model'],param_grid=mp['param'],cv=5,return_train_score=False)
    model_selection.fit(x,y)
    scores.append({
        'model':model_name,
        'best_score': model_selection.best_score_,
        'best_params': model_selection.best_params_
    })

In [46]:
scores

[{'model': 'XGBClassifier',
  'best_score': 0.9909995109480938,
  'best_params': {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 100}}]