In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression


In [3]:
data=pd.read_csv('MFG10YearTerminationData.csv')

In [4]:
data.head()

Unnamed: 0,EmployeeID,recorddate_key,birthdate_key,orighiredate_key,terminationdate_key,age,length_of_service,city_name,department_name,job_title,store_name,gender_short,gender_full,termreason_desc,termtype_desc,STATUS_YEAR,STATUS,BUSINESS_UNIT
0,1318,12/31/2006 0:00,1/3/1954,8/28/1989,1/1/1900,52,17,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2006,ACTIVE,HEADOFFICE
1,1318,12/31/2007 0:00,1/3/1954,8/28/1989,1/1/1900,53,18,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2007,ACTIVE,HEADOFFICE
2,1318,12/31/2008 0:00,1/3/1954,8/28/1989,1/1/1900,54,19,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2008,ACTIVE,HEADOFFICE
3,1318,12/31/2009 0:00,1/3/1954,8/28/1989,1/1/1900,55,20,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2009,ACTIVE,HEADOFFICE
4,1318,12/31/2010 0:00,1/3/1954,8/28/1989,1/1/1900,56,21,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2010,ACTIVE,HEADOFFICE


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49653 entries, 0 to 49652
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   EmployeeID           49653 non-null  int64 
 1   recorddate_key       49653 non-null  object
 2   birthdate_key        49653 non-null  object
 3   orighiredate_key     49653 non-null  object
 4   terminationdate_key  49653 non-null  object
 5   age                  49653 non-null  int64 
 6   length_of_service    49653 non-null  int64 
 7   city_name            49653 non-null  object
 8   department_name      49653 non-null  object
 9   job_title            49653 non-null  object
 10  store_name           49653 non-null  int64 
 11  gender_short         49653 non-null  object
 12  gender_full          49653 non-null  object
 13  termreason_desc      49653 non-null  object
 14  termtype_desc        49653 non-null  object
 15  STATUS_YEAR          49653 non-null  int64 
 16  STAT

# Initial Preprocessing

In [12]:
def preprocess_inputs(df):
    df=df.copy()
    # drop unnessecary columns
    df=df.drop(['EmployeeID','gender_short'],axis=1)
    
    #drop columns not available before termination
    df=df.drop(['terminationdate_key','length_of_service','termreason_desc','termtype_desc'],axis=1)
    X=df.drop(['STATUS'],axis=1)
    y=df['STATUS']
    
    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [13]:
X_train, X_test, y_train, y_test=preprocess_inputs(data)

In [14]:
X_train

Unnamed: 0,recorddate_key,birthdate_key,orighiredate_key,age,city_name,department_name,job_title,store_name,gender_full,STATUS_YEAR,BUSINESS_UNIT
36271,12/31/2011 0:00,8/18/1979,10/30/2005,32,Abbotsford,Bakery,Baker,1,Male,2011,STORES
3950,12/31/2008 0:00,4/27/1953,12/8/1991,55,Vancouver,Produce,Produce Clerk,41,Female,2008,STORES
26963,12/31/2006 0:00,9/26/1971,8/29/2001,35,New Westminster,Bakery,Baker,21,Male,2006,STORES
29451,12/31/2011 0:00,10/16/1973,9/30/2002,38,Terrace,Processed Foods,Shelf Stocker,32,Male,2011,STORES
3790,12/31/2008 0:00,3/10/1953,11/13/1991,55,Burnaby,Meats,Meat Cutter,5,Male,2008,STORES
...,...,...,...,...,...,...,...,...,...,...,...
11284,12/31/2010 0:00,6/9/1959,3/2/1995,51,Vancouver,Meats,Meat Cutter,42,Male,2010,STORES
44732,12/31/2013 0:00,4/26/1988,6/1/2010,25,Terrace,Customer Service,Cashier,32,Female,2013,STORES
38158,12/31/2012 0:00,3/13/1981,8/28/2006,31,Nanaimo,Processed Foods,Shelf Stocker,18,Female,2012,STORES
860,12/31/2007 0:00,2/6/1950,3/27/1990,57,Kamloops,Processed Foods,Processed Foods Manager,15,Male,2007,STORES


# Building Pipeline

In [16]:
{column: len(X_train[column].unique()) for column in X_train.select_dtypes('object').columns}

{'recorddate_key': 130,
 'birthdate_key': 5276,
 'orighiredate_key': 4377,
 'city_name': 40,
 'department_name': 21,
 'job_title': 47,
 'gender_full': 2,
 'BUSINESS_UNIT': 2}

In [15]:
X_train.select_dtypes('object')

Unnamed: 0,recorddate_key,birthdate_key,orighiredate_key,city_name,department_name,job_title,gender_full,BUSINESS_UNIT
36271,12/31/2011 0:00,8/18/1979,10/30/2005,Abbotsford,Bakery,Baker,Male,STORES
3950,12/31/2008 0:00,4/27/1953,12/8/1991,Vancouver,Produce,Produce Clerk,Female,STORES
26963,12/31/2006 0:00,9/26/1971,8/29/2001,New Westminster,Bakery,Baker,Male,STORES
29451,12/31/2011 0:00,10/16/1973,9/30/2002,Terrace,Processed Foods,Shelf Stocker,Male,STORES
3790,12/31/2008 0:00,3/10/1953,11/13/1991,Burnaby,Meats,Meat Cutter,Male,STORES
...,...,...,...,...,...,...,...,...
11284,12/31/2010 0:00,6/9/1959,3/2/1995,Vancouver,Meats,Meat Cutter,Male,STORES
44732,12/31/2013 0:00,4/26/1988,6/1/2010,Terrace,Customer Service,Cashier,Female,STORES
38158,12/31/2012 0:00,3/13/1981,8/28/2006,Nanaimo,Processed Foods,Shelf Stocker,Female,STORES
860,12/31/2007 0:00,2/6/1950,3/27/1990,Kamloops,Processed Foods,Processed Foods Manager,Male,STORES


In [23]:
# Transformer for date columns that extract year, month and day features
class DateTransformer:
    def fit(self,X,y):
        return self
    def transform(self,X):
        for column in X.columns:
            X[column]=pd.to_datetime(X[column])
            X[column +'_year']=X[column].apply(lambda x: x.year)
            X[column +'_month']=X[column].apply(lambda x: x.month)
            X[column +'_day']=X[column].apply(lambda x: x.day)
            X=X.drop(column,axis=1)
        return X

In [26]:
#classify features by type
binary_features=['gender_full','BUSINESS_UNIT']
date_features=['recorddate_key','birthdate_key','orighiredate_key']
nominal_features=['city_name','department_name','job_title']

#Construct transformers to handle each type of features
binary_transformer=Pipeline(steps=[
    ('ordinal',OrdinalEncoder(categories='auto'))
])

nominal_transformer=Pipeline(steps=[
    ('nominal',OneHotEncoder())
])
    
date_transformer=Pipeline(steps=[
    ('date',DateTransformer())
])



In [28]:
# Build a preprocessing transformer with ColumnTransformer
preprocessor=ColumnTransformer(transformers=[
    ('binary',binary_transformer, binary_features),
    ('nominal',nominal_transformer, nominal_features),
    ('date',date_transformer, date_features),
    
],sparse_threshold=0)

In [30]:
# Build the final pipeline
model=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('classifier',LogisticRegression())
])

# Training

In [31]:
model.fit(X_train,y_train)


# Results

In [34]:
acc=model.score(X_test,y_test)
print('Test Accuracy: {:.2f}%'.format(acc*100))

Test Accuracy: 100.00%
