In [82]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin  # import the libraries needed to create the Custom Scaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [83]:
# load the preprocessed CSV data
data_preprocessed = pd.read_csv('df_preprocessed.csv')

In [84]:
data_preprocessed

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2


### Create the target

In [85]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [86]:
# Binary target: 1 when the absence is longer than the median, 0 otherwise
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [87]:
# Create a Series in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism'] = targets

In [88]:
# Check if dataset is balanced
targets.sum() / targets.shape[0]

0.45571428571428574

In [89]:
# Create a checkpoint by dropping unnecessary variables:
# the raw hours column (replaced by the binary target) and the variables
# we 'eliminated' after exploring the weights
columns_to_drop = ['Absenteeism Time in Hours', 'Day of the Week',
                   'Daily Work Load Average', 'Distance to Work']
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [90]:
data_with_targets is data_preprocessed

False

### Select the inputs for regression & Standardize the inputs

In [91]:
# Plain StandardScaler instance; not referenced by the cells visible in this section.
scaler = StandardScaler()


class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The columns named in ``columns`` are passed through a ``StandardScaler``;
    every other column is returned unchanged. The output keeps the column
    order and the row index of the input frame.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that is fitted on ``columns``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params(), clone() and repr() can read them back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass by keyword: StandardScaler's parameters are keyword-only in
        # current scikit-learn releases, so positional arguments raise TypeError.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis/ddof: per-column statistics regardless of how the
        # installed pandas version interprets np.mean / np.var on a DataFrame.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized and the rest untouched.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns
        # Scale the selected columns. Reuse X's index: without it the scaled
        # frame would get a fresh 0..n-1 index and the index-aligned concat
        # below would misalign rows (and introduce NaNs) for any X whose index
        # is not the default range, e.g. a shuffled or filtered subset.
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns=self.columns, index=X.index)
        # everything that was not selected for scaling
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [92]:
# Inputs = every column except the target. Select by name rather than by position,
# so the right column is removed even if the column order of the frame changes.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

In [93]:
# Check what are all columns that we've got
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [94]:
# Columns left untouched by the scaler
columns_to_omit = ['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Education']
# Every remaining input column gets standardized (original column order is kept)
columns_to_scale = [column for column in unscaled_inputs.columns
                    if column not in columns_to_omit]

In [95]:
# Build the column-selective scaler for the chosen columns
absenteeism_scaler = CustomScaler(columns_to_scale)
# Fit it (per-column mean and standard deviation) and standardize the inputs;
# fit() returns the scaler itself, so the two steps can be chained.
# NOTE(review): the scaler is fitted on every row of unscaled_inputs, i.e. before
# the train/test split further down, so test rows contribute to the scaling statistics.
scaled_inputs = absenteeism_scaler.fit(unscaled_inputs).transform(unscaled_inputs)

In [96]:
scaled_inputs.shape
scaled_inputs

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


### Split train and test set

In [97]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)
display(x_train, y_train, x_test, y_test)

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
346,0,0,0,1,1.527833,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
91,0,0,1,0,1.228426,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
299,1,0,0,0,0.929019,-0.654143,-1.006686,-1.819793,1,-0.919030,-0.589690
129,0,0,1,0,1.527833,-0.654143,-1.006686,-1.819793,1,-0.919030,-0.589690
695,1,0,0,0,-0.568019,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
...,...,...,...,...,...,...,...,...,...,...,...
218,1,0,0,0,1.228426,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
223,0,0,0,1,-0.268611,1.036026,0.562059,-0.408580,0,-0.019280,0.268487
271,0,0,0,1,0.629611,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
474,0,0,0,0,0.030796,2.092381,-1.320435,0.061825,0,-0.019280,2.843016


array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
535,0,0,0,1,0.929019,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
281,0,0,0,1,0.629611,1.036026,0.562059,-0.408580,0,-0.019280,0.268487
324,0,0,0,1,1.228426,0.190942,1.032682,2.649049,0,-0.019280,-0.589690
645,0,0,0,1,-1.166834,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
10,1,0,0,0,0.030796,0.568211,-0.065439,-0.878984,0,2.679969,-0.589690
...,...,...,...,...,...,...,...,...,...,...,...
136,0,0,0,0,-1.765648,1.005844,-0.536062,0.767431,0,0.880469,0.268487
430,0,0,0,1,1.228426,2.092381,-1.320435,0.061825,0,-0.019280,2.843016
32,0,0,0,1,0.330204,0.190942,0.091435,0.532229,1,-0.019280,0.268487
449,0,0,0,1,-0.268611,0.356940,0.718933,-0.878984,0,-0.919030,-0.589690


array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0])

### Train the Logistic Regression Model

In [98]:
reg_mod = LogisticRegression().fit(x_train, y_train)

In [99]:
reg_mod.coef_ 

array([[ 1.61232741, -0.08710849,  1.97163111, -0.64491636,  0.02601198,
         0.59180985, -0.15649919,  0.2754436 , -0.30072808,  0.32204547,
        -0.29066065]])

In [100]:
reg_mod.intercept_

array([-0.31748159])

In [101]:
# Pair every input column name with the coefficient the model learned for it
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame({'Feature name': feature_name,
                              'Coefficient': reg_mod.coef_[0]})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason1,1.612327
1,Reason2,-0.087108
2,Reason3,1.971631
3,Reason4,-0.644916
4,Month,0.026012
5,Transportation Expense,0.59181
6,Age,-0.156499
7,Body Mass Index,0.275444
8,Education,-0.300728
9,Children,0.322045


In [102]:
# Add the intercept to the summary table as the first row (index 0).
# The table is rebuilt with concat instead of shifting the index in place, so that
# re-running this cell neither shifts the index again nor adds a second 'Intercept' row.
intercept_row = pd.DataFrame({'Feature name': ['Intercept'],
                              'Coefficient': [reg_mod.intercept_[0]]})
# keep only the feature rows (drops an 'Intercept' row left by a previous run)
feature_rows = summary_table[summary_table['Feature name'] != 'Intercept']
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.317482
1,Reason1,1.612327
2,Reason2,-0.087108
3,Reason3,1.971631
4,Reason4,-0.644916
5,Month,0.026012
6,Transportation Expense,0.59181
7,Age,-0.156499
8,Body Mass Index,0.275444
9,Education,-0.300728


### Interpreting the coefficients

In [103]:
# Add an 'Odds_ratio' column: the exponential of each coefficient
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
# Show the table ordered from the largest odds ratio to the smallest
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason3,1.971631,7.182382
1,Reason1,1.612327,5.014468
6,Transportation Expense,0.59181,1.807256
10,Children,0.322045,1.379948
8,Body Mass Index,0.275444,1.317115
5,Month,0.026012,1.026353
2,Reason2,-0.087108,0.916578
7,Age,-0.156499,0.855132
11,Pets,-0.290661,0.747769
9,Education,-0.300728,0.740279


### Test the model

In [104]:
# Assess the accuracy of the model on the test set
pred = reg_mod.score(x_test, y_test)
pred

0.7285714285714285

In [105]:
# find the predicted probabilities of each class
# the first column shows the probability of a particular observation to be 0, while the second one - to be 1
predicted_proba = reg_mod.predict_proba(x_test)
predicted_proba

array([[0.77076248, 0.22923752],
       [0.64968167, 0.35031833],
       [0.5210383 , 0.4789617 ],
       [0.78025258, 0.21974742],
       [0.08057783, 0.91942217],
       [0.27089136, 0.72910864],
       [0.28388841, 0.71611159],
       [0.10458021, 0.89541979],
       [0.77595871, 0.22404129],
       [0.77486457, 0.22513543],
       [0.41865385, 0.58134615],
       [0.14410715, 0.85589285],
       [0.05894143, 0.94105857],
       [0.61952164, 0.38047836],
       [0.25757679, 0.74242321],
       [0.58208876, 0.41791124],
       [0.57659874, 0.42340126],
       [0.42625453, 0.57374547],
       [0.33024304, 0.66975696],
       [0.05786968, 0.94213032],
       [0.7704964 , 0.2295036 ],
       [0.78025258, 0.21974742],
       [0.42392779, 0.57607221],
       [0.41633849, 0.58366151],
       [0.20572099, 0.79427901],
       [0.77730975, 0.22269025],
       [0.56006429, 0.43993571],
       [0.8918655 , 0.1081345 ],
       [0.20091886, 0.79908114],
       [0.78025258, 0.21974742],
       [0.

In [106]:
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]

array([0.22923752, 0.35031833, 0.4789617 , 0.21974742, 0.91942217,
       0.72910864, 0.71611159, 0.89541979, 0.22404129, 0.22513543,
       0.58134615, 0.85589285, 0.94105857, 0.38047836, 0.74242321,
       0.41791124, 0.42340126, 0.57374547, 0.66975696, 0.94213032,
       0.2295036 , 0.21974742, 0.57607221, 0.58366151, 0.79427901,
       0.22269025, 0.43993571, 0.1081345 , 0.79908114, 0.21974742,
       0.49903085, 0.72786594, 0.73974087, 0.5851323 , 0.21974742,
       0.69387939, 0.22539815, 0.80840164, 0.39592345, 0.54420383,
       0.21841499, 0.52900527, 0.22000564, 0.69470197, 0.82391035,
       0.660601  , 0.7436025 , 0.22242978, 0.23037687, 0.21708839,
       0.54449621, 0.58533544, 0.72910864, 0.22354511, 0.50681861,
       0.35387165, 0.91078824, 0.23142998, 0.63300662, 0.634814  ,
       0.72385742, 0.73369858, 0.23561188, 0.80056907, 0.21402431,
       0.22786434, 0.24484595, 0.22676082, 0.80135531, 0.38784836,
       0.21734435, 0.62522935, 0.89851469, 0.39220419, 0.53550

### Save the model

In [107]:
# import the relevant module
import pickle

# Pickle the trained model and the fitted scaler, one file each.
# NOTE(review): pickle stores classes by reference, so loading the 'scaler' file
# presumably requires a CustomScaler definition to be importable there - confirm.
for target_path, fitted_object in (('Prediction/Module/model', reg_mod),
                                   ('Prediction/Module/scaler', absenteeism_scaler)):
    with open(target_path, 'wb') as file:
        pickle.dump(fitted_object, file)