In [1]:
import pandas as pd

In [7]:
hr_data = pd.read_csv('https://raw.githubusercontent.com/zekelabs/machine-learning-for-beginners/master/data/HR_comma_sep.csv')

In [10]:
hr_data.rename(columns={'sales':'dept'}, inplace=True)

In [27]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


### Featurization
* Dept is nominal
* salary is ordinal

In [12]:
num_cols = hr_data.select_dtypes(exclude=['object'])

In [241]:
num_cols.columns.tolist()

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years']

In [14]:
cat_cols = hr_data.select_dtypes('object')

In [228]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder,LabelEncoder, LabelBinarizer,StandardScaler

In [140]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self,key):
        self.key = key
        
    def fit(self,X,Y=None):
        return self
    
    def transform(self,X,Y=None):
        return X[self.key]
        

In [141]:
item_selector = ItemSelector('dept')

In [142]:
pipeline_dept = Pipeline([
    ('selector', ItemSelector('dept')),
    ('lb', MyLabelBinarizer()),
])

In [230]:
class MultiItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self,keys):
        self.keys = keys
        
    def fit(self,X,Y=None):
        return self
    
    def transform(self,X,Y=None):
        return X[self.keys]

In [231]:
class SalaryMapper(BaseEstimator, TransformerMixin):
    
    def fit(self,X,Y=None):
        return self
    
    def transform(self,X,Y=None):
        
        db = {'low':1,'medium':2,'high':3}
        print (type(X))
        r = X.str.strip().replace(db)
        return r.values.reshape(-1,1)
        

In [232]:
pipeline_salary = Pipeline([
    ('selector',ItemSelector('salary')),
    ('sm',SalaryMapper())
])

In [242]:
pipeline_numbers = Pipeline([
    ('selector',MultiItemSelector(num_cols.columns.tolist())),
    ('scaling',StandardScaler())
])

In [243]:
fu = FeatureUnion([
    ('one',pipeline_dept),
    ('two',pipeline_salary),
    ('three',pipeline_numbers)
])

In [244]:
pipeline = Pipeline([
    ('union',fu)
])

In [245]:
pipeline.fit(hr_data,hr_data.left)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('one', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='dept')), ('lb', <__main__.MyLabelBinarizer object at 0x0000027B53A4DEF0>)])), ('two', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='salary')), ('sm', SalaryMapper())])), ('three', Pipeline(m...promotion_last_5years'])), ('scaling', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
       transformer_weights=None))])

In [246]:
pipeline.transform(hr_data)

<class 'pandas.core.series.Series'>


array([[ 0.        ,  0.        ,  0.        , ..., -0.41116529,
         1.788917  , -0.14741182],
       [ 0.        ,  0.        ,  0.        , ..., -0.41116529,
         1.788917  , -0.14741182],
       [ 0.        ,  0.        ,  0.        , ..., -0.41116529,
         1.788917  , -0.14741182],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.41116529,
         1.788917  , -0.14741182],
       [ 0.        ,  0.        ,  0.        , ..., -0.41116529,
         1.788917  , -0.14741182],
       [ 0.        ,  0.        ,  0.        , ..., -0.41116529,
         1.788917  , -0.14741182]])

In [206]:
lb.fit_transform(hr_data.dept)

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [133]:
class MyLabelBinarizer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self
    def transform(self, x, y=0):
        return self.encoder.transform(x)