In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import plotly.offline
from plotly.offline import iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

### Data We have

In [2]:
data = pd.read_csv('C:\\Users\\evita\\Downloads\\HR_comma_sep.csv', encoding='cp1251', sep =',')
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


### Number of employees quitting in each department

In [3]:
k=data[data.left==1].groupby('Department', as_index=False).left.count()
trace = go.Bar(x = k.Department,y = k.left)
layout = go.Layout(title='Number of employees quitting in each department')

fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

### Distribution of employees by salary level in each department

In [4]:
import plotly.express as px
dat = data.groupby(['Department', 'salary']).size().reset_index()
dat = dat.rename(columns={0: 'Amount of workers'})
fig=px.bar(dat, x='Department', y='Amount of workers', color='salary')
iplot(fig)

### Distribution of retired and remaining employees by the level of salaries

In [5]:
fig=px.bar(data.groupby(['left', 'salary']).size().reset_index(), x='left', y=0, color='salary')
iplot(fig)

### Max, min and mean satisfaction level of retired and remaining employees

In [6]:
fig = go.Figure(data=[
    go.Bar(name='Left', x=['max', 'min', 'mean'], y=[data[data.left==1].satisfaction_level.max(),
                                                     data[data.left==1].satisfaction_level.min(),
                                                     data[data.left==1].satisfaction_level.mean()]),
    go.Bar(name='Not left', x=['max', 'min', 'mean'], y=[data[data.left==0].satisfaction_level.max(),
                                                         data[data.left==0].satisfaction_level.min(),
                                                         data[data.left==0].satisfaction_level.mean()])
])
# Change the bar mode
fig.update_layout(barmode='group')
iplot(fig)

### Replacing categorical data for further analysis (getting dummies for departments and changing low, medium and high salary to 0, 1, 1 (we should allocate a small salary, becouse majority of left employees have it)

In [7]:
dep = pd.get_dummies(data['Department'], prefix='dep')
data=pd.concat([data, dep], axis=1)
data.drop(['Department'], axis=1, inplace=True)
data['salary']=data['salary'].replace({'low':0, 'medium':1, 'high':1})
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,dep_IT,dep_RandD,dep_accounting,dep_hr,dep_management,dep_marketing,dep_product_mng,dep_sales,dep_support,dep_technical
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,1,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,1,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,0,1,0,0


#### Separation of the dataset with a sign of dismissal from the employee’s signs

In [9]:
y=pd.DataFrame(data['left'])
data=data.drop(['left'], axis=1)

#### Separation of the sample into training and test

In [10]:
X_train, X_test, y_train, y_test=train_test_split(data, y, test_size=0.33, random_state=100)

### Trying Logistic regression

In [11]:
model = LogisticRegression()

model.fit(X_train, y_train)
expected=y_test
predicted=model.predict(X_test)
print(metrics.confusion_matrix(expected, predicted))
print(model.score(X_train, y_train.values.ravel()))
print(model.score(X_test, y_test.values.ravel()))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



[[3464  282]
 [ 793  411]]
0.787939098417753
0.7828282828282829



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



#### It has only 78% accuracy. Look at the confusin matrix

In [12]:
fig = px.imshow(metrics.confusion_matrix(expected, predicted),
                labels=dict(x="Predicted", y="Actual", color="Amount"))
iplot(fig)

#### Trying a separate vector machine, but before it let's the best hyperparameters with GridSearchTV

In [13]:
param_dist = {"C":[0.1, 1, 10, 100],
              "gamma":[1, 0.1, 0.01, 0.001],
              "kernel": ['rbf']}
modell = SVC()
modell_cv = GridSearchCV(modell, param_dist, verbose=3, refit=True)
modell_cv.fit(X_train, y_train.values.ravel())
print("Tuned SVC Parameters: {}".format(modell_cv.best_params_))
print("Best score is {}".format(modell_cv.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.786, total=  14.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.0s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.788, total=  33.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   47.0s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.786, total=  30.7s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.783, total=  14.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.782, total=  18.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.929, total=   8.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.923, total=   6.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.919, total=   4.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.927, total=   5.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] .......... C=100, gamma=1, kernel=rbf, score=0.951, total=  11.2s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.953, total=  10.5s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.942, total=  10.9s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.956, total=  11.2s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.951, total=  11.2s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.954, total=   2.3s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.962, total=   2.4s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  9.0min finished


Tuned SVC Parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best score is 0.9620860357248106


#### Filling the SVM model with parameters selected by GridSearchCV

In [14]:
modell = SVC(C=100, gamma=0.01, kernel='rbf')
modell.fit(X_train, y_train)
print(modell)
predicted = modell.predict(X_test)
expected=y_test
print(metrics.confusion_matrix(expected, predicted))
print(modell.score(X_train, y_train))
print(modell.score(X_test, y_test))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
[[3634  112]
 [  84 1120]]
0.9711414071051846
0.9604040404040404


#### Although we see an overfitting, SVM perfectly copes with generalization -> 96% accuracy on test split. See the confusion matrix

In [15]:
fig = px.imshow(metrics.confusion_matrix(expected, predicted),
                labels=dict(x="Predicted", y="Actual", color="Amount"))
iplot(fig)

#### Trying Decision Tree Classifier, but before it let's the best hyperparameters with RandomizedSearchTV.

In [16]:
param_dist = {"max_depth": [3, None],
              "max_features": randint(1,9),
              "min_samples_leaf": randint(1,9),
              "criterion": ["gini", "entropy"]}
tree = DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(tree, param_dist, cv = 5)
tree_cv.fit(X_train, y_train)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 1}
Best score is 0.9763159810702584


#### Filling the Decision Tree model with parameters selected by RandomizedSearchCV

In [17]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, max_features=8, min_samples_leaf=1)
tree.fit(X_train, y_train)
predictio=tree.predict(X_test)
v=metrics.confusion_matrix(expected, predictio)
print(v)
print(tree.score(X_train, y_train))
print(tree.score(X_test, y_test))

[[3679   67]
 [  47 1157]]
1.0
0.9769696969696969


#### Although we see an overfitting, Decision Tree perfectly copes with generalization -> 97.3% accuracy on test split. See the confusion matrix

In [18]:
fig = px.imshow(v,labels=dict(x="Predicted", y="Actual", color="Amount"))
iplot(fig)

#### Trying Random Forest Classifier, but before it let's the best hyperparameters with RandomizedSearchTV.


In [19]:
from sklearn.ensemble import RandomForestClassifier
param_dist = {"n_estimators": randint(100,300),
              "criterion": ["gini", "entropy"],
              "max_depth": [3, None],
              "max_features": randint(1,9),
              "min_samples_leaf": randint(1,9)
              }
rand=RandomForestClassifier()
rand_cv = RandomizedSearchCV(rand, param_dist, cv = 5)
rand_cv.fit(X_train, y_train.values.ravel())
print("Tuned Random Forest Classifier Parameters: {}".format(rand_cv.best_params_))
print("Best score is {}".format(rand_cv.best_score_))

Tuned Random Forest Classifier Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1, 'n_estimators': 213}
Best score is 0.9867648318883431


#### Filling the RandomForest model with parameters selected by RandomizedSearchCV

In [20]:
rand=RandomForestClassifier(criterion='entropy', max_depth=None, max_features=3, min_samples_leaf=1, n_estimators=213)
rand.fit(X_train, y_train)
pred=rand.predict(X_test)
exp=y_test
vv=metrics.confusion_matrix(exp, pred)
print(vv)
print(rand.score(X_train, y_train))
print(rand.score(X_test, y_test))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



[[3736   10]
 [  40 1164]]
1.0
0.98989898989899


#### Although we see an overfitting, Random Forest perfectly copes with generalization -> 98.9% accuracy on test split. See the confusion matrix

In [21]:
fig = px.imshow(vv,labels=dict(x="Predicted", y="Actual", color="Amount"))
iplot(fig)

#### Here we can see importance of features: satisfaction level is the most important (28%), time spend company is the second one (20%), average montly hours is the third (16%), number of projects is forth (15%), last evaluation is the fifth (13%). The importance of the remaining symptoms is less than one percent

In [22]:
for name, importance in zip(data.columns, rand.feature_importances_):
    print(name, "=", importance)

satisfaction_level = 0.28696350877815735
last_evaluation = 0.13574423720294396
number_project = 0.14833327029869955
average_montly_hours = 0.16445224653726528
time_spend_company = 0.20674154401252845
Work_accident = 0.013049207745265861
promotion_last_5years = 0.0030603865086748475
salary = 0.011303727743118412
dep_IT = 0.0027903977920144657
dep_RandD = 0.0023442358350458424
dep_accounting = 0.002417564141139854
dep_hr = 0.0027115658672559875
dep_management = 0.0025527540825358666
dep_marketing = 0.001970754817827681
dep_product_mng = 0.001849372714192187
dep_sales = 0.004942227739272244
dep_support = 0.0036772523505152342
dep_technical = 0.005095745833546969


## Summary
### Random Forest Classifier is the most efficient model for this situation, 99% accuracy. leaving the company is more dependent on employee satisfaction