In [323]:
import pandas as pd
import numpy as np

In [324]:
# Load the data
hr_df = pd.read_csv( 'HR_comma_sep.csv' )

In [325]:
hr_df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

In [327]:
#missings
hr_df.isnull().any().sum()

0

In [328]:
hr_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
satisfaction_level,14999.0,0.612834,0.248631,0.09,0.44,0.64,0.82,1.0
last_evaluation,14999.0,0.716102,0.171169,0.36,0.56,0.72,0.87,1.0
number_project,14999.0,3.803054,1.232592,2.0,3.0,4.0,5.0,7.0
average_montly_hours,14999.0,201.050337,49.943099,96.0,156.0,200.0,245.0,310.0
time_spend_company,14999.0,3.498233,1.460136,2.0,3.0,3.0,4.0,10.0
Work_accident,14999.0,0.14461,0.351719,0.0,0.0,0.0,0.0,1.0
left,14999.0,0.238083,0.425924,0.0,0.0,0.0,0.0,1.0
promotion_last_5years,14999.0,0.021268,0.144281,0.0,0.0,0.0,0.0,1.0


The summary statistics for Work_accident, left and promotion_last_5years does not make sense, as they are categorical variables

### PREDICTIVE MODEL: Build a model to predict if an employee will leave the company

In [329]:
# Encoding Categorical Features
numerical_features = ['satisfaction_level', 'last_evaluation', 'number_project',
     'average_montly_hours', 'time_spend_company']

categorical_features = ['Work_accident','promotion_last_5years', 'department', 'salary']

In [330]:
# An utility function to create dummy variable
def create_dummies( df, colname ):
    col_dummies = pd.get_dummies(df[colname], prefix=colname)
    col_dummies.drop(col_dummies.columns[0], axis=1, inplace=True)
    df = pd.concat([df, col_dummies], axis=1)
    df.drop( colname, axis = 1, inplace = True )
    return df

In [331]:
for c_feature in categorical_features:
    hr_df = create_dummies( hr_df, c_feature )

In [332]:
hr_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,left,Work_accident_1,promotion_last_5years_1,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,1,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,1,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,1,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,1,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,1,0,0,0,0,0,0,0,0,1,0,0,1,0


In [333]:
#Splitting the data

feature_columns = hr_df.columns.difference( ['left'] )


In [335]:
hr_df.shape

(11991, 19)

In [337]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(hr_df,test_size=0.2,random_state=42)



In [338]:
sum(train.duplicated()==True)

0

In [339]:
train.shape

(9592, 19)

In [340]:
bs1 = train.sample(n=None,frac=1,replace=True,random_state=123)

bs2 = train.sample(n=None,frac=1,replace=True,random_state=124)

bs3 = train.sample(n=None,frac=1,replace=True,random_state=125)

bs4 = train.sample(n=None,frac=1,replace=True,random_state=126)

bs5 = train.sample(n=None,frac=1,replace=True,random_state=127)


In [341]:
sum(bs5.duplicated()==True)

3492

In [342]:
feature_columns = train.columns.difference(['left'])

In [343]:
feature_columns_test = test.columns.difference(['left'])

In [344]:
test_X = test[feature_columns_test]
test_y = test['left']

In [345]:
bs1_X = bs1[feature_columns]
bs1_y = bs1['left']

In [346]:
bs2_X = bs2[feature_columns]
bs2_y = bs2['left']

In [347]:
bs3_X = bs3[feature_columns]
bs3_y = bs3['left']

In [348]:
bs4_X = bs4[feature_columns]
bs4_y = bs4['left']

In [349]:
bs5_X = bs5[feature_columns]
bs5_y = bs5['left']

In [350]:
bs5_X.shape

(9592, 18)

### Building Decion Tree Model with Gini

In [351]:
import sklearn.tree as dt

from sklearn.tree import DecisionTreeClassifier, export_graphviz, export
from sklearn.model_selection import GridSearchCV

In [352]:
bs1_X.head()

Unnamed: 0,Work_accident_1,average_montly_hours,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,last_evaluation,number_project,promotion_last_5years_1,salary_low,salary_medium,satisfaction_level,time_spend_company
10225,0,139,1,0,0,0,0,0,0,0,0,0.89,3,0,0,1,0.73,3
5321,0,246,0,0,0,0,0,0,0,1,0,0.61,5,0,0,1,0.83,3
9733,0,178,0,0,0,0,0,0,0,0,1,0.59,3,0,0,1,0.56,2
7303,1,152,0,0,0,0,0,0,0,1,0,0.71,3,0,0,1,0.61,4
4904,0,232,0,0,0,0,0,0,1,0,0,0.85,4,0,0,1,0.7,3


In [353]:
clf_tree_gini = DecisionTreeClassifier(max_depth = 3)
clf_tree_gini.fit(bs1_X, bs1_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Building Decion Tree Model with Entropy

In [354]:
clf_tree_entropy = DecisionTreeClassifier(criterion='entropy', max_depth = 3)
clf_tree_entropy.fit(bs2_X, bs2_y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Bagging Model

In [355]:
import sklearn.ensemble as en

from sklearn.ensemble import BaggingClassifier

In [356]:
bagclm = BaggingClassifier(oob_score=True, n_estimators = 100)
bagclm.fit(bs3_X,bs3_y)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
         verbose=0, warm_start=False)

### Boosting Model

In [357]:
from sklearn.ensemble import AdaBoostClassifier

In [358]:
paramgrid_ada = {'n_estimators': [100,200,400], 'learning_rate':[10**x for x in range(-2,2)]}

The model is predicting the probability of him leaving the company is only 0.027, which is very low.

In [359]:
from sklearn.grid_search import GridSearchCV

gscv_ada = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=paramgrid_ada,cv=5, verbose=False, n_jobs=-1)

In [360]:
gscv_ada.fit(bs4_X,bs4_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [100, 200, 400], 'learning_rate': [0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=False)

In [361]:
probdf_gini = pd.DataFrame(clf_tree_gini.predict_proba(bs5_X))

In [362]:
probdf_entropy = pd.DataFrame(clf_tree_entropy.predict_proba(bs5_X))

In [363]:
probdf_bagging = pd.DataFrame(bagclm.predict_proba(bs5_X))

In [364]:
probdf_boosting = pd.DataFrame(gscv_ada.predict_proba(bs5_X))

In [365]:
bs5_X.shape


(9592, 18)

In [366]:
probdf_gini.shape

(9592, 2)

In [367]:
probdf_entropy.shape

(9592, 2)

In [368]:
probdf_bagging.shape

(9592, 2)

In [369]:
probdf_boosting.shape

(9592, 2)

In [370]:
sum(bs5_X.duplicated())

3492

In [371]:
bs5_X.reset_index()

Unnamed: 0,index,Work_accident_1,average_montly_hours,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,last_evaluation,number_project,promotion_last_5years_1,salary_low,salary_medium,satisfaction_level,time_spend_company
0,2550,0,187,0,0,1,0,0,0,0,0,0,0.62,2,0,1,0,0.46,3
1,8246,0,225,0,1,0,0,0,0,0,0,0,0.70,3,0,1,0,0.61,3
2,4224,0,273,0,0,0,0,0,0,1,0,0,0.68,4,0,1,0,0.83,2
3,226,0,158,0,0,1,0,0,0,0,0,0,0.46,2,0,1,0,0.37,3
4,6368,0,131,0,0,1,0,0,0,0,0,0,0.95,3,0,0,1,0.64,6
5,598,0,157,0,0,0,0,0,0,0,0,0,0.46,2,0,1,0,0.37,3
6,2959,0,197,0,0,0,0,0,0,0,0,1,0.82,2,0,1,0,0.55,4
7,2728,0,152,0,0,0,0,0,0,0,0,1,0.49,3,0,0,1,0.55,2
8,4845,0,155,0,0,0,0,0,0,0,0,1,0.52,5,0,1,0,0.54,3
9,5549,0,242,0,0,0,0,0,0,0,1,0,0.53,4,0,0,1,0.81,3


In [372]:
bs5_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9592 entries, 2550 to 9459
Data columns (total 18 columns):
Work_accident_1            9592 non-null uint8
average_montly_hours       9592 non-null int64
department_RandD           9592 non-null uint8
department_accounting      9592 non-null uint8
department_hr              9592 non-null uint8
department_management      9592 non-null uint8
department_marketing       9592 non-null uint8
department_product_mng     9592 non-null uint8
department_sales           9592 non-null uint8
department_support         9592 non-null uint8
department_technical       9592 non-null uint8
last_evaluation            9592 non-null float64
number_project             9592 non-null int64
promotion_last_5years_1    9592 non-null uint8
salary_low                 9592 non-null uint8
salary_medium              9592 non-null uint8
satisfaction_level         9592 non-null float64
time_spend_company         9592 non-null int64
dtypes: float64(2), int64(3), uint8(13)


In [373]:
mergedXProbadf = pd.concat([probdf_gini[1],probdf_entropy[1],probdf_bagging[1],probdf_boosting[1]],axis=1,sort=False)
mergedXProbadf.columns=['p(Gini)','p(Entropy)','p(Bagging)','p(Boosting)']
bs5_sample_mergeddf = pd.merge(bs5_X.reset_index(),mergedXProbadf,left_index=True,right_index=True)
#bs5_sample_mergeddf = pd.merge(bs5_X,mergedXProbadf,how='inner')

In [374]:
bs5_sample_mergeddf.drop(labels='index',axis=1,inplace=True)

In [375]:
bs5_sample_mergeddf.head()

Unnamed: 0,Work_accident_1,average_montly_hours,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,...,number_project,promotion_last_5years_1,salary_low,salary_medium,satisfaction_level,time_spend_company,p(Gini),p(Entropy),p(Bagging),p(Boosting)
0,0,187,0,0,1,0,0,0,0,0,...,2,0,1,0,0.46,3,0.0,0.824475,0.0,0.498438
1,0,225,0,1,0,0,0,0,0,0,...,3,0,1,0,0.61,3,0.006916,0.009972,0.0,0.489915
2,0,273,0,0,0,0,0,0,1,0,...,4,0,1,0,0.83,2,0.006916,0.009972,0.0,0.493973
3,0,158,0,0,1,0,0,0,0,0,...,2,0,1,0,0.37,3,0.913462,0.824475,1.0,0.506301
4,0,131,0,0,1,0,0,0,0,0,...,3,0,0,1,0.64,6,0.703911,0.704731,0.0,0.495572


##### 

In [319]:
bs5_sample_mergeddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8633 entries, 0 to 8632
Data columns (total 23 columns):
level_0                    8633 non-null int64
Work_accident_1            8633 non-null uint8
average_montly_hours       8633 non-null int64
department_RandD           8633 non-null uint8
department_accounting      8633 non-null uint8
department_hr              8633 non-null uint8
department_management      8633 non-null uint8
department_marketing       8633 non-null uint8
department_product_mng     8633 non-null uint8
department_sales           8633 non-null uint8
department_support         8633 non-null uint8
department_technical       8633 non-null uint8
last_evaluation            8633 non-null float64
number_project             8633 non-null int64
promotion_last_5years_1    8633 non-null uint8
salary_low                 8633 non-null uint8
salary_medium              8633 non-null uint8
satisfaction_level         8633 non-null float64
time_spend_company         8633 non-null in

In [318]:
mergedXProbadf.shape

(8633, 4)

In [284]:
#print(mergedXProbadf)

In [293]:
mergedXProbadf[mergedXProbadf.duplicated()==True]

Unnamed: 0,p(Gini),p(Entropy),p(Bagging),p(Boosting)
459,0.804318,0.808625,1.0,0.501339
553,0.804318,0.808625,1.0,0.502213
682,1.000000,1.000000,1.0,0.535934
725,1.000000,1.000000,1.0,0.557625
801,1.000000,1.000000,1.0,0.557724
868,1.000000,1.000000,1.0,0.535442
1165,0.804318,0.808625,1.0,0.501790
1203,0.804318,0.808625,1.0,0.501676
1224,0.804318,0.808625,1.0,0.502544
1292,0.804318,0.808625,1.0,0.501806


In [290]:
np.sum(mergedXProbadf.duplicated())

258

In [282]:
mergedXProbadf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8633 entries, 0 to 8632
Data columns (total 4 columns):
p(Gini)        8633 non-null float64
p(Entropy)     8633 non-null float64
p(Bagging)     8633 non-null float64
p(Boosting)    8633 non-null float64
dtypes: float64(4)
memory usage: 269.9 KB


In [276]:
bs5_X.shape

(8633, 18)

In [280]:
bs5_sample_mergeddf.shape

(6165, 22)

In [312]:
bs5_sample_mergeddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8633 entries, 0 to 8632
Data columns (total 24 columns):
level_0                    8633 non-null int64
index                      8633 non-null int64
Work_accident_1            8633 non-null uint8
average_montly_hours       8633 non-null int64
department_RandD           8633 non-null uint8
department_accounting      8633 non-null uint8
department_hr              8633 non-null uint8
department_management      8633 non-null uint8
department_marketing       8633 non-null uint8
department_product_mng     8633 non-null uint8
department_sales           8633 non-null uint8
department_support         8633 non-null uint8
department_technical       8633 non-null uint8
last_evaluation            8633 non-null float64
number_project             8633 non-null int64
promotion_last_5years_1    8633 non-null uint8
salary_low                 8633 non-null uint8
salary_medium              8633 non-null uint8
satisfaction_level         8633 non-null floa

In [278]:
mergedXProbadf.head()

Unnamed: 0,p(Gini),p(Entropy),p(Bagging),p(Boosting)
0,0.009364,0.007774,0.0,0.494896
1,0.046463,0.045121,0.0,0.49573
2,0.046463,0.045121,0.07,0.497396
3,0.009364,0.007774,0.0,0.495446
4,0.009364,0.007774,0.0,0.475829


In [313]:
bs5_sample_mergeddf.head()

Unnamed: 0,level_0,index,Work_accident_1,average_montly_hours,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,...,number_project,promotion_last_5years_1,salary_low,salary_medium,satisfaction_level,time_spend_company,p(Gini),p(Entropy),p(Bagging),p(Boosting)
0,0,7522,1,143,0,0,0,0,0,0,...,5,0,0,0,0.82,2,0.009364,0.007774,0.0,0.494896
1,1,3574,0,276,0,0,0,0,0,0,...,4,0,0,1,0.27,2,0.046463,0.045121,0.0,0.49573
2,2,7780,0,138,1,0,0,0,0,0,...,4,0,1,0,0.31,2,0.046463,0.045121,0.07,0.497396
3,3,3211,1,175,0,0,0,1,0,0,...,4,0,1,0,0.58,3,0.009364,0.007774,0.0,0.495446
4,4,7384,0,139,0,0,0,0,0,0,...,2,0,0,0,0.53,3,0.009364,0.007774,0.0,0.475829


In [262]:
# mergedXProbadf = mergedXProbadf[mergedXProbadf.columns.difference(['index'])]
#mergedXProbadf.shape
bs5_X.shape

(8633, 18)

In [263]:
bs5_sample_mergeddf.shape

(8633, 22)

In [285]:
#print(probdf_gini)

In [265]:
probdf_entropy.head()

Unnamed: 0,0,1
0,0.992226,0.007774
1,0.954879,0.045121
2,0.954879,0.045121
3,0.992226,0.007774
4,0.992226,0.007774


In [266]:
probdf_bagging.head()

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,0.93,0.07
3,1.0,0.0
4,1.0,0.0


In [267]:
probdf_boosting.head()

Unnamed: 0,0,1
0,0.505104,0.494896
1,0.50427,0.49573
2,0.502604,0.497396
3,0.504554,0.495446
4,0.524171,0.475829


In [376]:
from sklearn.ensemble import RandomForestClassifier

In [377]:
radm_clf = RandomForestClassifier(oob_score=True,n_estimators=100 )
radm_clf.fit( bs5_sample_mergeddf, bs5_y )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [382]:
p1_gini=pd.DataFrame(clf_tree_gini.predict_proba(test_X))
p2_entropy= pd.DataFrame(clf_tree_entropy.predict_proba(test_X))
p3_bagging= pd.DataFrame(bagclm.predict_proba(test_X))
p4_boosting= pd.DataFrame(gscv_ada.predict_proba(test_X))
mergedtestProbadf = pd.concat([p1_gini[1],p2_entropy[1],p3_bagging[1],p4_boosting[1]],axis=1,sort=False)
mergedtestProbadf.columns=['p(Gini)','p(Entropy)','p(Bagging)','p(Boosting)']
test_sample_mergeddf = pd.merge(test_X.reset_index(),mergedtestProbadf,left_index=True,right_index=True)
#pd.concat()
#pd.merge(test_X)


In [383]:
test_sample_mergeddf.drop(labels='index',axis=1,inplace=True)

In [393]:
final_prob=radm_clf.predict(test_sample_mergeddf)
final_prob


array([1, 0, 0, ..., 0, 0, 0])

In [386]:
radm_test_pred = pd.DataFrame( { 'actual':  test_y,'predicted': radm_clf.predict( test_sample_mergeddf ) } )

#radm_clf.predict_proba(test_X)

In [388]:
from sklearn import metrics
metrics.accuracy_score( radm_test_pred.actual, radm_test_pred.predicted )

0.9779074614422676