In [1]:
import pandas as pd

In [2]:
hr_data = pd.read_csv('data/HR_comma_sep.csv')

In [4]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [8]:
### Rename col
hr_data.rename(columns={'sales':'dept'}, inplace=True)

In [9]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
dept                     14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [10]:
from sklearn_pandas import DataFrameMapper

In [13]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder

In [17]:
mapper = DataFrameMapper([
    ('satisfaction_level',None),
    ('last_evaluation',None),
    (['number_project','average_montly_hours','time_spend_company'], StandardScaler()),
    ('Work_accident',None),
    ('promotion_last_5years',None),
    ('dept', LabelEncoder()),
    ('salary',LabelEncoder())
])

In [18]:
mapper

DataFrameMapper(default=False, df_out=False,
        features=[('satisfaction_level', None), ('last_evaluation', None), (['number_project', 'average_montly_hours', 'time_spend_company'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('Work_accident', None), ('promotion_last_5years', None), ('dept', LabelEncoder()), ('salary', LabelEncoder())],
        input_df=False, sparse=False)

In [21]:
feature_data = mapper.fit_transform(hr_data)



In [23]:
hr_data.left[:5]

0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [27]:
pipe = Pipeline([
    ('mapper',mapper),
    ('select',SelectKBest(score_func=f_classif, k=7)),
    ('lr',LogisticRegression())
])

In [28]:
pipe

Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('satisfaction_level', None), ('last_evaluation', None), (['number_project', 'average_montly_hours', 'time_spend_company'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('Work_accident', None), ('promotion...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [29]:
from sklearn.model_selection import train_test_split

In [31]:
trainX,testX,trainY,testY = train_test_split(hr_data.drop('left',axis=1),hr_data.left)

In [32]:
pipe.fit(trainX,trainY)



Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('satisfaction_level', None), ('last_evaluation', None), (['number_project', 'average_montly_hours', 'time_spend_company'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('Work_accident', None), ('promotion...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [33]:
pipe.score(testX,testY)



0.76613333333333333

### Hyper-Parameter Tuning using Grid Search

In [42]:
params = {
    'select__k':[4,6,8,9],
    'lr__C':[.001,.01]
}

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
grid = GridSearchCV(n_jobs=-1, param_grid=params, estimator=pipe)

In [45]:
grid.fit(trainX,trainY)



GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('satisfaction_level', None), ('last_evaluation', None), (['number_project', 'average_montly_hours', 'time_spend_company'], StandardScaler(copy=True, with_mean=True, with_std=True)), ('Work_accident', None), ('promotion...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'select__k': [4, 6, 8, 9], 'lr__C': [0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [46]:
grid.best_params_

{'lr__C': 0.001, 'select__k': 6}

In [47]:
grid.best_score_

0.7596230776068984

In [48]:
model = grid.best_estimator_

In [50]:
model.predict(testX[:5])



array([0, 0, 0, 0, 0], dtype=int64)

In [51]:
from sklearn.model_selection import cross_val_score, KFold

In [55]:
acc = cross_val_score(model,hr_data,hr_data.left ,cv=5, n_jobs=-1, scoring='accuracy')

In [57]:
acc.mean()

0.72217089081898789