# Creating a logistic regression to predict absenteeism

## import libraries

In [1]:
import numpy as np
import pandas as pd

## load the preprocessed data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_Type1,Reason_Type2,Reason_Type3,Reason_Type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

### first approach

In [5]:
#absent_classifier = []
#for i in range (data_preprocessed.shape[0]):
#    if data_preprocessed['Absenteeism Time in Hours'][i] > 3:
#        absent_classifier.append(1)
#    else:
#        absent_classifier.append(0)
    

In [6]:
#absent_classifier

### better approach

In [7]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0 )

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_Type1,Reason_Type2,Reason_Type3,Reason_Type4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### checking if data is balanced

In [9]:
data_preprocessed['Excessive Absenteeism'].value_counts()

Excessive Absenteeism
0    381
1    319
Name: count, dtype: int64

### create a checkpoint

In [10]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week', 'Daily Work Load Average', 'Distance to Work'], axis=1)

In [11]:
data_with_targets.head()

Unnamed: 0,Reason_Type1,Reason_Type2,Reason_Type3,Reason_Type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## Select inputs for logistic regression

In [12]:
unscaled_inputs = data_with_targets.iloc[:,:-1]


In [13]:
unscaled_inputs.head()

Unnamed: 0,Reason_Type1,Reason_Type2,Reason_Type3,Reason_Type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


## Standardize the data

In [14]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler = StandardScaler()

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self,columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean = None
        self.var = None

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean = np.mean(X[self.columns])
        self.var = np.var(X[self.columns])
        return self

    def transform(self, X,y=None,copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled,X_scaled], axis = 1)[init_col_order]

In [16]:
unscaled_inputs.columns.values

array(['Reason_Type1', 'Reason_Type2', 'Reason_Type3', 'Reason_Type4',
       'Month Value', 'Transportation Expense', 'Age', 'Body Mass Index',
       'Education', 'Children', 'Pets'], dtype=object)

In [17]:
columns_to_scale = ['Month Value', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Children', 'Pets']

In [18]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [19]:
absenteeism_scaler.fit(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [20]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [21]:
scaled_inputs

Unnamed: 0,Reason_Type1,Reason_Type2,Reason_Type3,Reason_Type4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [22]:
scaled_inputs.shape

(700, 11)

## Splitting the data (train, validation, test)

In [23]:
from sklearn.model_selection import train_test_split
#train_test_split(inputs,targets)

In [24]:
train_test_split(scaled_inputs, targets)

[     Reason_Type1  Reason_Type2  Reason_Type3  Reason_Type4  Month Value  \
 458             1             0             0             0    -0.102784   
 633             0             0             0             1    -0.959313   
 101             0             0             0             1     1.610276   
 491             0             0             0             1     0.468236   
 301             0             0             1             0     1.039256   
 ..            ...           ...           ...           ...          ...   
 566             0             0             0             1     1.610276   
 498             0             0             0             1     0.753746   
 261             1             0             0             0     0.468236   
 359             0             0             1             0    -1.530333   
 317             1             0             0             0     1.324766   
 
      Transportation Expense       Age  Body Mass Index  Education  Childr

In [25]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [26]:
x_train.shape, y_train.shape

((560, 11), (560,))

In [27]:
x_test.shape, y_test.shape

((140, 11), (140,))

## Logistic regression with sklearn

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [29]:
reg = LogisticRegression()

In [30]:
reg.fit(x_train, y_train)

In [31]:
reg.score(x_train, y_train)

0.7732142857142857

## Check the accuracy manually

In [32]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [33]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [34]:
np.sum(model_outputs == y_train)

433

In [35]:
model_outputs.shape[0]

560

In [36]:
439/560

0.7839285714285714

## Find the intercept and coefficients

In [37]:
reg.intercept_

array([-1.6469898])

In [38]:
reg.coef_

array([[ 2.80000644,  0.95174778,  3.1140605 ,  0.83835931,  0.15897713,
         0.60513709, -0.16990589,  0.27998236, -0.21017416,  0.34842434,
        -0.27721907]])

In [39]:
unscaled_inputs.columns.values

array(['Reason_Type1', 'Reason_Type2', 'Reason_Type3', 'Reason_Type4',
       'Month Value', 'Transportation Expense', 'Age', 'Body Mass Index',
       'Education', 'Children', 'Pets'], dtype=object)

In [40]:
feature_name = unscaled_inputs.columns.values

In [41]:
summary_table = pd.DataFrame(columns = ['Feature Name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table 

Unnamed: 0,Feature Name,Coefficient
0,Reason_Type1,2.800006
1,Reason_Type2,0.951748
2,Reason_Type3,3.114061
3,Reason_Type4,0.838359
4,Month Value,0.158977
5,Transportation Expense,0.605137
6,Age,-0.169906
7,Body Mass Index,0.279982
8,Education,-0.210174
9,Children,0.348424


In [42]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()


In [43]:
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.64699
1,Reason_Type1,2.800006
2,Reason_Type2,0.951748
3,Reason_Type3,3.114061
4,Reason_Type4,0.838359
5,Month Value,0.158977
6,Transportation Expense,0.605137
7,Age,-0.169906
8,Body Mass Index,0.279982
9,Education,-0.210174


## Interpret coefficients

In [44]:
summary_table['Odds ratio'] = np.exp(summary_table.Coefficient)

In [45]:
summary_table.sort_values(['Odds ratio'], ascending = False)

Unnamed: 0,Feature Name,Coefficient,Odds ratio
3,Reason_Type3,3.114061,22.51227
1,Reason_Type1,2.800006,16.444753
2,Reason_Type2,0.951748,2.590233
4,Reason_Type4,0.838359,2.31257
6,Transportation Expense,0.605137,1.831503
10,Children,0.348424,1.416833
8,Body Mass Index,0.279982,1.323106
5,Month Value,0.158977,1.172311
7,Age,-0.169906,0.843744
9,Education,-0.210174,0.810443


## Testing the model

In [46]:
reg.score(x_test,y_test)

0.75

In [47]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71342516, 0.28657484],
       [0.5873216 , 0.4126784 ],
       [0.44016153, 0.55983847],
       [0.78163061, 0.21836939],
       [0.08407928, 0.91592072],
       [0.3348226 , 0.6651774 ],
       [0.29971206, 0.70028794],
       [0.13112385, 0.86887615],
       [0.78627908, 0.21372092],
       [0.74906578, 0.25093422],
       [0.49395555, 0.50604445],
       [0.22492002, 0.77507998],
       [0.07135527, 0.92864473],
       [0.73173354, 0.26826646],
       [0.30957854, 0.69042146],
       [0.54726422, 0.45273578],
       [0.55051921, 0.44948079],
       [0.53926379, 0.46073621],
       [0.40197149, 0.59802851],
       [0.05365482, 0.94634518],
       [0.70030387, 0.29969613],
       [0.78163061, 0.21836939],
       [0.42028246, 0.57971754],
       [0.42028246, 0.57971754],
       [0.24801464, 0.75198536],
       [0.74567806, 0.25432194],
       [0.51026557, 0.48973443],
       [0.8569309 , 0.1430691 ],
       [0.20365204, 0.79634796],
       [0.78163061, 0.21836939],
       [0.

In [48]:
predicted_proba[:,1]

array([0.28657484, 0.4126784 , 0.55983847, 0.21836939, 0.91592072,
       0.6651774 , 0.70028794, 0.86887615, 0.21372092, 0.25093422,
       0.50604445, 0.77507998, 0.92864473, 0.26826646, 0.69042146,
       0.45273578, 0.44948079, 0.46073621, 0.59802851, 0.94634518,
       0.29969613, 0.21836939, 0.57971754, 0.57971754, 0.75198536,
       0.25432194, 0.48973443, 0.1430691 , 0.79634796, 0.21836939,
       0.36947677, 0.67913195, 0.68508325, 0.52870791, 0.21836939,
       0.53505228, 0.22144744, 0.73673169, 0.40500758, 0.60504297,
       0.21072119, 0.45227108, 0.23749326, 0.39847178, 0.82763577,
       0.56771922, 0.69120847, 0.28657484, 0.2192347 , 0.2032712 ,
       0.57634482, 0.32954238, 0.6651774 , 0.26937528, 0.83323682,
       0.43484145, 0.88365871, 0.23125087, 0.33433749, 0.34451397,
       0.69915101, 0.6549938 , 0.29244583, 0.79186052, 0.20752232,
       0.26838009, 0.08710411, 0.22144744, 0.73215219, 0.30536526,
       0.22144744, 0.2900789 , 0.90443841, 0.46065771, 0.60175

## Save the model

In [49]:
import pickle

In [50]:
with open('model', 'wb') as file:
    pickle.dump(reg,file)

In [51]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)