# Data Pre-processing

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the HR dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')

In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [7]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [10]:
# The two categorical columns that need to be one-hot encoded.
feats = ['Department', 'salary']
# Replace each categorical column with numeric indicator columns, which the
# network requires; drop_first=True omits the first level of each column.
df_final = pd.get_dummies(data=df, columns=feats, drop_first=True)

#  Separating Your Training and Testing Datasets

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Feature matrix: every column except the target.
X = df_final.drop(columns=['left']).values
# Target vector: the binary `left` column.
y = df_final['left'].values

In [13]:
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore the train/test split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Transforming the Data

In [14]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both the training and the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Building the Artificial Neural Network

In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [21]:
# Start from an empty Sequential model (a linear stack of layers); the layers
# themselves are added in the following cells.
classifier = Sequential()

In [22]:
# Hidden layer: 9 ReLU units fed by the 18 input features.
classifier.add(
    Dense(units=9, activation="relu", kernel_initializer="uniform", input_dim=18)
)

In [23]:
# Output layer: a single sigmoid unit, so the output can be read as the
# probability that an employee will leave.
classifier.add(
    Dense(units=1, activation="sigmoid", kernel_initializer="uniform")
)

In [24]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported
# during training.
classifier.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [25]:
# Train for a single epoch in mini-batches of 10 rows.
classifier.fit(x=X_train, y=y_train, epochs=1, batch_size=10)



<keras.callbacks.History at 0x2043dd05ee0>

# Running Predictions on the Test Set

In [26]:
# Model outputs (sigmoid probabilities) for the held-out rows.
y_pred = classifier.predict(x=X_test)

In [27]:
# Convert the probabilities to boolean class labels with a 0.5 cut-off.
y_pred = y_pred > 0.5

# Checking the Confusion Matrix

In [28]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[3192,  202],
       [ 490,  616]], dtype=int64)

In [29]:
from sklearn.metrics import confusion_matrix
# NOTE(review): this cell repeats the previous confusion-matrix cell and
# recomputes the same result from the same inputs.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[3192,  202],
       [ 490,  616]], dtype=int64)

# Making a Single Prediction

In [30]:
# Score one hand-built employee record of 18 feature values (assumed to follow
# the column order of the training matrix). The record goes through the fitted
# scaler before being passed to the network.
new_pred = classifier.predict(
    sc.transform(
        np.array([[0.26, 0.7, 3., 238., 6., 0., 0., 0., 0.,
                   0., 0., 0., 0., 0., 1., 0., 0., 1.]])
    )
)

In [31]:
# Threshold the predicted probability at 0.5 and store the boolean label under
# a separate name. `new_pred` keeps the raw probability so that a different
# cut-off can still be applied to it afterwards; overwriting it with the boolean
# would make any further comparison meaningless (True > 0.6 is always True).
new_pred_label = (new_pred > 0.5)
new_pred_label

array([[ True]])

In [32]:
# Re-threshold `new_pred` with a 0.6 cut-off and display the result.
# NOTE(review): this only has an effect while `new_pred` still holds
# probabilities; on an already-boolean array the comparison returns the same
# values unchanged.
new_pred = new_pred > 0.6
new_pred

array([[ True]])

# Improving the Model Accuracy

In [33]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

In [34]:
def make_classifier():
    """Build and compile the 18-input, single-hidden-layer binary classifier.

    Returns:
        A compiled Sequential model: a 9-unit ReLU hidden layer followed by a
        single sigmoid output unit, trained with Adam on binary cross-entropy
        and reporting accuracy.
    """
    model = Sequential([
        Dense(units=9, kernel_initializer="uniform", activation="relu", input_dim=18),
        Dense(units=1, kernel_initializer="uniform", activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [35]:
# Wrap the model factory so scikit-learn can treat it as an estimator.
# The epoch count is passed as `epochs`, the same keyword used for fit() and in
# the grid-search parameters elsewhere in this notebook; `nb_epoch` is the
# legacy Keras 1 spelling.
classifier = KerasClassifier(build_fn=make_classifier, batch_size=10, epochs=1)

  classifier = KerasClassifier(build_fn = make_classifier, batch_size=10, nb_epoch=1)


In [36]:
# 10-fold cross-validation on the training split; n_jobs=-1 asks scikit-learn
# to use all available processors.
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)

In [37]:
# Average accuracy across the cross-validation folds.
mean = np.mean(accuracies)
mean

0.8139782190322876

In [38]:
# Spread of the per-fold accuracies (population variance).
variance = np.var(accuracies)
variance

0.0026248170789217795

# Adding Dropout Regularization to Fight Over-Fitting

In [39]:
from keras.layers import Dropout

# Same architecture as before, with a Dropout layer (rate 0.1) inserted between
# the hidden layer and the output layer.
# NOTE(review): nothing in the cells shown fits this model; `classifier` is
# reassigned in the hyperparameter-tuning section before it is used again.
classifier = Sequential([
    Dense(units=9, kernel_initializer="uniform", activation="relu", input_dim=18),
    Dropout(rate=0.1),
    Dense(units=1, kernel_initializer="uniform", activation="sigmoid"),
])
classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Hyperparameter Tuning

In [40]:
from sklearn.model_selection import GridSearchCV
def make_classifier(optimizer="adam"):
    """Build and compile the 18-input binary classifier with a chosen optimizer.

    This definition replaces the earlier zero-argument ``make_classifier``.
    The default keeps zero-argument calls working after this cell has run.

    Args:
        optimizer: Optimizer passed straight to ``compile``. Defaults to
            "adam", the optimizer the zero-argument version used.

    Returns:
        A compiled Sequential model: a 9-unit ReLU hidden layer followed by a
        single sigmoid output unit, using binary cross-entropy loss and
        reporting accuracy.
    """
    classifier = Sequential()
    classifier.add(Dense(9, kernel_initializer="uniform", activation="relu", input_dim=18))
    classifier.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))
    classifier.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])
    return classifier

In [41]:
# Wrap the factory without fixing any training settings here; batch size,
# epochs and optimizer are supplied by the grid-search parameters.
classifier = KerasClassifier(build_fn = make_classifier)

  classifier = KerasClassifier(build_fn = make_classifier)


In [42]:
# Candidate values for the grid search: 2 x 2 x 2 = 8 combinations.
params = dict(
    batch_size=[20, 35],
    epochs=[2, 3],
    optimizer=['adam', 'rmsprop'],
)

In [43]:
# Exhaustive search over every combination in `params`, scored by accuracy
# with 2-fold cross-validation.
grid_search = GridSearchCV(classifier, params, scoring="accuracy", cv=2)

In [44]:
# Run the search on the training split; fit() returns the search object itself.
grid_search = grid_search.fit(X=X_train, y=y_train)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [45]:
# Winning parameter combination and its mean cross-validated accuracy.
best_param, best_accuracy = grid_search.best_params_, grid_search.best_score_

In [46]:
# Show the parameter combination selected by the grid search.
best_param

{'batch_size': 20, 'epochs': 3, 'optimizer': 'adam'}

In [48]:
# Show the mean cross-validated accuracy of the selected combination.
best_accuracy

0.8676038973409901

In [None]:
# The best cross-validated accuracy found by the grid search was about 86.8%.