In [1]:
import pandas as pd

In [3]:
emp_data = pd.read_csv('HR_comma_sep.csv')

In [4]:
emp_data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [8]:
emp_data.rename(columns={'sales':'department'},inplace=True)

In [10]:
len(emp_data)

14999

In [12]:
#people who left
len(emp_data[emp_data.left == 1])

3571

In [13]:
emp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
department               14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


### Featurization - Converting String to numbers

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()

In [18]:
emp_data['salary'] = le.fit_transform(emp_data.salary)

In [19]:
emp_data['department'] = le.fit_transform(emp_data.department)

### Finding important features/columns
* We will use RandomForest for feature identification

In [92]:
data = emp_data.loc[:,emp_data.columns != 'left']

In [93]:
pred = emp_data['left']

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
model = RandomForestClassifier(n_estimators=1000)

In [34]:
pred.values

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [36]:
model.fit(data,pred.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
model.feature_importances_

array([ 0.34313806,  0.11872019,  0.18036941,  0.14633834,  0.18475989,
        0.0055402 ,  0.0009203 ,  0.01224341,  0.00797019])

In [38]:
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

In [39]:
#First 5 features are important for training

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier

from sklearn.model_selection import train_test_split

models_name = ["RF", "GBC", "LR","ABC"]
models = [RandomForestClassifier(n_estimators=100),GradientBoostingClassifier(n_estimators=100),LogisticRegression(), AdaBoostClassifier(n_estimators=100)]

In [94]:
trainX, testX, trainY, testY = train_test_split(data,pred)

In [56]:
from sklearn.metrics import accuracy_score

In [95]:
trainX.shape

(11249, 9)

In [58]:
testX.shape

(3750, 9)

In [96]:
# With all features
i = 0
for model in models:
    
    model.fit(trainX,trainY)
    prediction = model.predict(testX)
    accuracy = accuracy_score(testY,prediction)
    print (models_name[i] + ' ' + str(accuracy))
    i += 1
    

RF 0.990133333333
GBC 0.973333333333
LR 0.765866666667
ABC 0.956533333333


In [67]:
#Using only important features

In [84]:
data = data[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company']]

In [73]:
 data.columns.tolist()[:5]

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company']

In [85]:
trainX, testX, trainY, testY = train_test_split(data,pred)

In [91]:
# With 5 imp features
i = 0
for model in models:
    
    model.fit(trainX,trainY)
    prediction = model.predict(testX)
    accuracy = accuracy_score(testY,prediction)
    print (models_name[i] + ' ' + str(accuracy))
    i += 1

RF 0.990133333333
GBC 0.972
LR 0.761866666667
ABC 0.9616


In [82]:
trainX.shape

(11249, 6)

In [83]:
trainX

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident
1040,0.11,0.89,6,309,4,0
1259,0.45,0.48,2,138,3,0
14190,0.52,0.55,5,174,3,1
6140,0.98,0.77,4,184,3,0
1976,0.37,0.47,2,159,3,1
10209,0.50,0.80,4,261,3,0
8966,0.51,0.91,4,194,2,0
4046,0.71,0.96,4,144,4,0
1721,0.26,0.46,2,242,3,0
10751,0.84,0.96,6,155,5,0
