In [1]:
import pandas as pd

In [2]:
hr_data = pd.read_csv('HR_comma_sep.csv')

In [4]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
hr_data.rename(columns={'sales':'department'},inplace=True)

In [6]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
### Featurization - convert string to numbers

In [8]:
hr_data.department.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [9]:
hr_data.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

In [10]:
from sklearn.preprocessing import LabelEncoder
for col in ['department','salary']:
    le = LabelEncoder()
    hr_data[col] = le.fit_transform(hr_data[col])

In [11]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,7,1
1,0.8,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1


In [12]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
department               14999 non-null int64
salary                   14999 non-null int64
dtypes: float64(2), int64(8)
memory usage: 1.1 MB


In [15]:
#Correlation between left & other features
type(hr_data.corr().loc['left'])

pandas.core.series.Series

In [16]:
hr_data.corr().loc['left']

satisfaction_level      -0.388375
last_evaluation          0.006567
number_project           0.023787
average_montly_hours     0.071287
time_spend_company       0.144822
Work_accident           -0.154622
left                     1.000000
promotion_last_5years   -0.061788
department               0.032105
salary                  -0.001294
Name: left, dtype: float64

In [17]:
fea_corr = hr_data.corr().loc['left']

In [18]:
fea_corr.sort_values()

satisfaction_level      -0.388375
Work_accident           -0.154622
promotion_last_5years   -0.061788
salary                  -0.001294
last_evaluation          0.006567
number_project           0.023787
department               0.032105
average_montly_hours     0.071287
time_spend_company       0.144822
left                     1.000000
Name: left, dtype: float64

In [19]:
import numpy as np

In [21]:
np.abs(fea_corr).sort_values()

salary                   0.001294
last_evaluation          0.006567
number_project           0.023787
department               0.032105
promotion_last_5years    0.061788
average_montly_hours     0.071287
time_spend_company       0.144822
Work_accident            0.154622
satisfaction_level       0.388375
left                     1.000000
Name: left, dtype: float64

In [35]:
### Spliting data into feature & target. Also, train & test

In [27]:
hr_data_x = hr_data[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','department','salary']]

In [28]:
#target
hr_data_y = hr_data['left']

In [32]:
from sklearn.model_selection import train_test_split
trainX,testX,trainY,testY = train_test_split(hr_data_x,hr_data_y)

In [34]:
testY.shape

(3750,)

In [None]:
### Apply Machine Learning Classification algos

In [22]:
from sklearn.linear_model import LogisticRegression 

In [23]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB

In [24]:
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
#needs 

In [40]:
base_models = [ LogisticRegression(), GaussianNB() , MultinomialNB() ]
base_models_name = ['LogisticRegression','GaussianNB','MultinomialNB']

In [37]:
from sklearn.metrics import accuracy_score

In [41]:
for idx,model in enumerate(base_models):
    model.fit(trainX,trainY)
    y_pred = model.predict(testX)
    print (base_models_name[idx],accuracy_score(testY,y_pred))

LogisticRegression 0.766666666667
GaussianNB 0.7848
MultinomialNB 0.767466666667


In [60]:
### Feature generation function
def feature_gen_func(data,feature_cols,target_col):
    feature_data = data[feature_cols]
    target_data = data[target_col]
    trainX,testX,trainY,testY = train_test_split(feature_data,target_data)
    return trainX,testX,trainY,testY

In [61]:
trainX,testX,trainY,testY = feature_gen_func(hr_data,['satisfaction_level','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years'],'left')

In [62]:
trainY.shape

(11249,)

In [63]:
def process_models(hr_data, base_models,base_models_name,feature_cols,target):
    trainX,testX,trainY,testY = feature_gen_func(hr_data,feature_cols,target)
    for idx,model in enumerate(base_models):
        model.fit(trainX,trainY)
        y_pred = model.predict(testX)
        print (base_models_name[idx],accuracy_score(testY,y_pred))

In [65]:
process_models(hr_data, base_models, base_models_name, ['satisfaction_level','number_project','average_montly_hours','time_spend_company','promotion_last_5years'],'left')

GradientBoostingClassifier 0.969066666667
RandomForestClassifier 0.986133333333


In [66]:
base_models = [GradientBoostingClassifier(n_estimators=10),RandomForestClassifier(n_estimators=10)]

In [67]:
base_models_name = ['GradientBoostingClassifier','RandomForestClassifier']

In [69]:
process_models(hr_data,base_models,base_models_name,['satisfaction_level','number_project','average_montly_hours','time_spend_company','promotion_last_5years'],'left')

GradientBoostingClassifier 0.968266666667
RandomForestClassifier 0.983733333333
