In [18]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression

In [19]:
# Load the termination dataset from a path relative to the notebook's working directory.
data = pd.read_csv('../input/employee-attrition-data/MFG10YearTerminationData.csv')

In [20]:
# Display the frame; the notebook renders a truncated preview (first and last rows).
data

Unnamed: 0,EmployeeID,recorddate_key,birthdate_key,orighiredate_key,terminationdate_key,age,length_of_service,city_name,department_name,job_title,store_name,gender_short,gender_full,termreason_desc,termtype_desc,STATUS_YEAR,STATUS,BUSINESS_UNIT
0,1318,12/31/2006 0:00,1/3/1954,8/28/1989,1/1/1900,52,17,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2006,ACTIVE,HEADOFFICE
1,1318,12/31/2007 0:00,1/3/1954,8/28/1989,1/1/1900,53,18,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2007,ACTIVE,HEADOFFICE
2,1318,12/31/2008 0:00,1/3/1954,8/28/1989,1/1/1900,54,19,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2008,ACTIVE,HEADOFFICE
3,1318,12/31/2009 0:00,1/3/1954,8/28/1989,1/1/1900,55,20,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2009,ACTIVE,HEADOFFICE
4,1318,12/31/2010 0:00,1/3/1954,8/28/1989,1/1/1900,56,21,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2010,ACTIVE,HEADOFFICE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49648,8258,12/1/2015 0:00,5/28/1994,8/19/2013,12/30/2015,21,2,Valemount,Dairy,Dairy Person,34,M,Male,Layoff,Involuntary,2015,TERMINATED,STORES
49649,8264,8/1/2013 0:00,6/13/1994,8/27/2013,8/30/2013,19,0,Vancouver,Customer Service,Cashier,44,F,Female,Resignaton,Voluntary,2013,TERMINATED,STORES
49650,8279,12/1/2015 0:00,7/18/1994,9/15/2013,12/30/2015,21,2,White Rock,Customer Service,Cashier,39,F,Female,Layoff,Involuntary,2015,TERMINATED,STORES
49651,8296,12/1/2013 0:00,9/2/1994,10/9/2013,12/31/2013,19,0,Kelowna,Customer Service,Cashier,16,F,Female,Resignaton,Voluntary,2013,TERMINATED,STORES


In [21]:
# Summarise the columns: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49653 entries, 0 to 49652
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   EmployeeID           49653 non-null  int64 
 1   recorddate_key       49653 non-null  object
 2   birthdate_key        49653 non-null  object
 3   orighiredate_key     49653 non-null  object
 4   terminationdate_key  49653 non-null  object
 5   age                  49653 non-null  int64 
 6   length_of_service    49653 non-null  int64 
 7   city_name            49653 non-null  object
 8   department_name      49653 non-null  object
 9   job_title            49653 non-null  object
 10  store_name           49653 non-null  int64 
 11  gender_short         49653 non-null  object
 12  gender_full          49653 non-null  object
 13  termreason_desc      49653 non-null  object
 14  termtype_desc        49653 non-null  object
 15  STATUS_YEAR          49653 non-null  int64 
 16  STAT

# **Initial Preprocessing**

In [22]:
def preprocess_input(df, train_size=0.7, random_state=1):
    """Drop unused columns, separate the ``STATUS`` target and split train/test.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain ``STATUS`` and every column dropped below.
        The caller's frame is left untouched (``drop`` returns a new frame).
    train_size : float or int, default=0.7
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default=1
        Seed forwarded to ``train_test_split`` so the shuffle is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and ``STATUS`` target series for each split.

    Notes
    -----
    NOTE(review): the split is row-wise and shuffled. The data preview shows
    the same ``EmployeeID`` on several yearly rows, so records of one employee
    can presumably land in both splits -- consider a grouped split before
    trusting the test score.
    """
    # Identifier plus the short gender code (the preview suggests it duplicates
    # 'gender_full' -- TODO confirm).
    unneeded_columns = ['EmployeeID', 'gender_short']

    # Flagged by the original author as "not available"; presumably these are
    # not known at prediction time -- verify before reusing them as features.
    unavailable_columns = [
        'terminationdate_key',
        'length_of_service',
        'termreason_desc',
        'termtype_desc',
    ]

    trimmed = df.drop(columns=unneeded_columns + unavailable_columns)

    # Split into target (y) and features (X).
    y = trimmed['STATUS']
    X = trimmed.drop(columns='STATUS')

    # Shuffled hold-out split; the defaults reproduce the original 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [23]:
# Build the train/test feature frames and target series from the raw data.
X_train, X_test, y_train, y_test = preprocess_input(data)

# Pipeline

In [31]:
# Extract year, month and day features from date-like columns.
class DateTransformer:
    """Replace every column of a DataFrame with year/month/day integer parts.

    For each input column ``c`` the output holds ``c_year``, ``c_month`` and
    ``c_day`` (in that order, grouped per input column); the source column
    itself is not kept. The transformer is stateless.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is optional so the transformer can be fitted with or without a
        target.
        """
        return self

    def transform(self, X):
        """Return a new DataFrame of date parts; ``X`` is never modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Every column must be parseable by ``pandas.to_datetime``.

        Returns
        -------
        pandas.DataFrame
            Same index as ``X``, three columns per input column.
        """
        date_parts = {}
        for column in X.columns:
            # Parse into a local Series instead of assigning back into X, so
            # the caller's frame is not mutated and repeated calls are safe.
            parsed = pd.to_datetime(X[column])
            for part in ('year', 'month', 'day'):
                # Plain arrays avoid any index re-alignment on construction.
                date_parts[f'{column}_{part}'] = getattr(parsed.dt, part).to_numpy()
        return pd.DataFrame(date_parts, index=X.index)
        

In [32]:
# Column groups, by how each group is encoded.
nominal_features = ['city_name', 'department_name', 'job_title']
binary_features = ['gender_full', 'BUSINESS_UNIT']
date_features = ['recorddate_key', 'birthdate_key', 'orighiredate_key']

# One single-step pipeline per group, so more steps can be added per group later.
binary_transformer = Pipeline(steps=[
    # Maps each category to an integer code.
    ('ordinal', OrdinalEncoder(categories='auto'))
])
nominal_transformer = Pipeline(steps=[
    # handle_unknown='ignore' encodes categories unseen during fit as all
    # zeros instead of raising, so a rare value that only appears in the
    # data being scored does not crash predict/score.
    ('nominal', OneHotEncoder(handle_unknown='ignore'))
])
date_transformer = Pipeline(steps=[
    # Expands each date column into year/month/day parts.
    ('date', DateTransformer())
])

In [33]:
# Combine the per-group transformers into one preprocessing step.
# Each entry is (name, transformer, columns). Columns that appear in no entry
# are not forwarded, because `remainder` is left at its default.
# sparse_threshold=0 asks ColumnTransformer to return a dense result.
preprocessor = ColumnTransformer(
    [
        ('binary', binary_transformer, binary_features),
        ('nominal', nominal_transformer, nominal_features),
        ('date', date_transformer, date_features),
    ],
    sparse_threshold=0,
)

In [34]:
# End-to-end estimator: column preprocessing, then standardisation, then
# a logistic-regression classifier with default settings.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
])

# Training

In [36]:
# Fit every pipeline step (preprocessor, scaler, classifier) on the training split.
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(sparse_threshold=0,
                                   transformers=[('binary',
                                                  Pipeline(steps=[('ordinal',
                                                                   OrdinalEncoder())]),
                                                  ['gender_full',
                                                   'BUSINESS_UNIT']),
                                                 ('nominal',
                                                  Pipeline(steps=[('nominal',
                                                                   OneHotEncoder())]),
                                                  ['city_name',
                                                   'department_name',
                                                   'job_title']),
                                                 ('date',
                                                  Pipeline(steps=[

# Results

In [39]:
# Score the fitted pipeline on the held-out split and report it as a percentage.
# NOTE(review): the recorded run printed 100.00%. A perfect score may indicate
# leakage (e.g. through the record-date features or repeated employees across
# splits) -- confirm before relying on it.
acc = model.score(X_test, y_test)
print(f'Test accuracy: {acc * 100:.2f}% ')

Test accuracy: 100.00% 
