In [1]:
#import libraries
import numpy as np
import pandas as pd

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8


In [4]:
#Create targets
#The aim here is to classify two groups of abseenteeism; One that is excessively absent, and the other that is only moderately absent
#The median value will be used as the cut-off line
#Dataset is implicitly balanced using this method (roughly half are 0 and the other 1)

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
#Convert into 0 & 1 targets
targets = np.where(data_preprocessed['Absenteeism Time in Hours']>
                   data_preprocessed['Absenteeism Time in Hours'].median() , 1,0)

In [7]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
#Add targets to df
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [10]:
#46% of targets are 1s, dataset fairly balanced
targets.sum()/targets.shape[0]

0.45571428571428574

In [11]:
#Abseenteeism hours no longer needed due to targets
#This variable was changed at the end of the analysis; Day of the week, daily work load and distance to work were found to have a coefficient of close to 0, so they were removed for simplicity and the analysis performed again at this checkpoint
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the week','Daily Work Load Average','Distance to Work'],axis=1 )

In [12]:
#Select inputs for regression

In [13]:
data_with_targets.shape

(700, 12)

In [14]:
#Slice data to separate off targets
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [15]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [16]:
#Create custom scaler that doesn't standardise all inputs
#If dummies are standardised, they lose it's interpretability as the reason for absenteeism is likely to be an important factor
#Alternatively, standardisation could occur before dummy creation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,columns,copy=True, with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns= columns
        self.mean_ = None
        self.var_=None
        
    def fit(self,X,y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self,X,y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled= X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis =1)[init_col_order]

#omit dummy variables        
columns_to_omit = ['Reason 1','Reason 2', 'Reason 3','Reason 4', 'Education']
#Use list comprehension incase more columns are required to be omitted
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
absenteeism_scaler= CustomScaler(columns_to_scale )
absenteeism_scaler.fit(unscaled_inputs)
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
#Dummies remain untouched
scaled_inputs   




Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,-0.577350,-0.092981,-0.314485,0.821365,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [17]:
scaled_inputs.shape

(700, 11)

In [18]:
#Split data and shuffle to remove any dependenices i.e. day of the week effect, and remove chances of overfitting the data

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 664  1.732051 -0.092981 -0.314485 -1.217485    -0.673803   
 579 -0.577350 -0.092981  3.179797 -1.217485    -1.530333   
 138  1.732051 -0.092981 -0.314485 -1.217485    -1.244823   
 629 -0.577350 -0.092981  3.179797 -1.217485    -0.959313   
 632 -0.577350 -0.092981 -0.314485  0.821365    -0.959313   
 ..        ...       ...       ...       ...          ...   
 51  -0.577350 -0.092981 -0.314485 -1.217485     0.753746   
 562 -0.577350 -0.092981  3.179797 -1.217485     1.610276   
 278  1.732051 -0.092981 -0.314485 -1.217485     0.753746   
 442 -0.577350 -0.092981 -0.314485  0.821365    -0.102784   
 75   1.732051 -0.092981 -0.314485 -1.217485     1.039256   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 664               -1.574681  0.091435         0.297027          0 -0.919030   
 579               -1.016322 -0.379188        -0.408580          0  0.880469   
 138                0.3569

In [21]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [22]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [23]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


In [24]:
#Modeling

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [26]:
#Training the model
reg = LogisticRegression()


In [27]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
reg.score(x_train,y_train)

0.7875

In [29]:
#Manually checking the accuracy

In [30]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [31]:
 y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [32]:
model_outputs ==y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [33]:
#Convert booleans into number of true predictions
np.sum(model_outputs ==y_train)

441

In [34]:
model_outputs.shape[0]

560

In [35]:
#This is how reg.score calculates accuracy
np.sum(model_outputs ==y_train) /model_outputs.shape[0]

0.7875

In [36]:
#Finding intercepts and coeffs

In [37]:
reg.intercept_

array([-0.17163628])

In [38]:
reg.coef_

array([[ 2.06866621,  0.33461133,  1.56034928,  1.31301781,  0.18521305,
         0.69049022, -0.19796143,  0.32782753, -0.31318895,  0.37213564,
        -0.32435396]])

In [39]:
#What features do the coefficients correspond to?
scaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [40]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [41]:
feature_name = unscaled_inputs.columns.values

In [42]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_name)
summary_table

Unnamed: 0,Feature Name
0,Reason_1
1,Reason_2
2,Reason_3
3,Reason_4
4,Month Value
5,Transportation Expense
6,Age
7,Body Mass Index
8,Education
9,Children


In [43]:
#Transpose coefficients to summary table
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Reason_1,2.068666
1,Reason_2,0.334611
2,Reason_3,1.560349
3,Reason_4,1.313018
4,Month Value,0.185213
5,Transportation Expense,0.69049
6,Age,-0.197961
7,Body Mass Index,0.327828
8,Education,-0.313189
9,Children,0.372136


In [44]:
#Adding intercept data by first shifting all indexs by 1, so index 0 is empty
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()

In [45]:
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-0.171636
1,Reason_1,2.068666
2,Reason_2,0.334611
3,Reason_3,1.560349
4,Reason_4,1.313018
5,Month Value,0.185213
6,Transportation Expense,0.69049
7,Age,-0.197961
8,Body Mass Index,0.327828
9,Education,-0.313189


In [46]:
#interpreting coefficients 

In [47]:
#Because logisitc regression is used, to interpret coefficient, they must be raised by the exponential to be more meaningful 
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficients)


In [48]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature Name,Coefficients,Odds_ratio
1,Reason_1,2.068666,7.91426
3,Reason_3,1.560349,4.760484
4,Reason_4,1.313018,3.717375
6,Transportation Expense,0.69049,1.994693
10,Children,0.372136,1.45083
2,Reason_2,0.334611,1.397397
8,Body Mass Index,0.327828,1.38795
5,Month Value,0.185213,1.203475
0,Intercept,-0.171636,0.842285
7,Age,-0.197961,0.820401


In [49]:
#If odds ratio is around 1, the feature is not important as multiple is 1 for a unit change in the standardised feature
#Remember, reason 0 means no reason was given and is the baseline model for the other reasons i.e. all other reason types are in comparision to reason 0
#In future, it would be worth creating another model without standardised features for greater interpretability of coefficients. 
#Because we are now going to predict absenteeism, standardisation leads to greater accuracy so we stick with that.

In [50]:
#Testing the data to see it's actual accuracy
#At this stage, the model cannot be tweaked as it would render the test data to be acting like training data, and overfitting is likely to occur
reg.score(x_test,y_test)

0.7357142857142858

In [51]:
#Instead of outputting the predicted 0s and 1s, we can find the probabilities of 0 or 1
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.70823956, 0.29176044],
       [0.57293437, 0.42706563],
       [0.39748224, 0.60251776],
       [0.78749486, 0.21250514],
       [0.06719693, 0.93280307],
       [0.31207365, 0.68792635],
       [0.28664053, 0.71335947],
       [0.08120082, 0.91879918],
       [0.80017608, 0.19982392],
       [0.74995457, 0.25004543],
       [0.46819712, 0.53180288],
       [0.18512014, 0.81487986],
       [0.04121567, 0.95878433],
       [0.75142701, 0.24857299],
       [0.23903853, 0.76096147],
       [0.53714554, 0.46285446],
       [0.53420084, 0.46579916],
       [0.52102189, 0.47897811],
       [0.40678406, 0.59321594],
       [0.02773414, 0.97226586],
       [0.70236673, 0.29763327],
       [0.78749486, 0.21250514],
       [0.40670612, 0.59329388],
       [0.40670612, 0.59329388],
       [0.17559234, 0.82440766],
       [0.75454373, 0.24545627],
       [0.48937739, 0.51062261],
       [0.87896092, 0.12103908],
       [0.13142638, 0.86857362],
       [0.78749486, 0.21250514],
       [0.

In [52]:
#1st col is p(0) and 2nd col is p(1)
#Therefore probability of excessive abseenteeism is p(1)
predicted_proba[:,1]

array([0.29176044, 0.42706563, 0.60251776, 0.21250514, 0.93280307,
       0.68792635, 0.71335947, 0.91879918, 0.19982392, 0.25004543,
       0.53180288, 0.81487986, 0.95878433, 0.24857299, 0.76096147,
       0.46285446, 0.46579916, 0.47897811, 0.59321594, 0.97226586,
       0.29763327, 0.21250514, 0.59329388, 0.59329388, 0.82440766,
       0.24545627, 0.51062261, 0.12103908, 0.86857362, 0.21250514,
       0.37628326, 0.69395995, 0.71016857, 0.55802337, 0.21250514,
       0.56289228, 0.2084135 , 0.80857331, 0.41359763, 0.62953752,
       0.20379034, 0.4242538 , 0.22639945, 0.10462937, 0.85742011,
       0.6523714 , 0.70507958, 0.29176044, 0.21052579, 0.19534428,
       0.56865035, 0.07901966, 0.68792635, 0.2680619 , 0.8589383 ,
       0.45311982, 0.92991324, 0.21727659, 0.08573649, 0.08997337,
       0.70893813, 0.67646259, 0.28670381, 0.85947072, 0.18954376,
       0.27039693, 0.01391914, 0.2084135 , 0.81053586, 0.29013739,
       0.2084135 , 0.06812914, 0.92885154, 0.47883471, 0.63880

In [53]:
#Save the model

In [54]:
import pickle

In [55]:
with open('module', 'wb') as file:
    pickle.dump(reg,file)

In [56]:
#Custom scaler must also be saved as model was heavily dependent on training data
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler,file)