In [96]:
import pandas as pd
import numpy as np

# Load the data

In [97]:
df_preproc = pd.read_csv('../absenteeism/data/Preprocessed_absenteeism_data.csv')

In [98]:
df_preproc.head()

Unnamed: 0,reason 1,reason 2,reason 3,reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3


# Target definition

In [99]:
df_preproc['Absenteeism Time in Hours'].median()

3.0

In [100]:
df_preproc['Absenteeism'] = np.where(df_preproc['Absenteeism Time in Hours'] > df_preproc['Absenteeism Time in Hours'].median(), 1, 0)

In [101]:
df_preproc['Absenteeism'].sum()/df_preproc.shape[0]

0.45571428571428574

In [102]:
df_preproc.drop(columns ='Absenteeism Time in Hours', inplace = True )

# Feature and target selection

In [103]:
df_preproc.shape

(700, 15)

In [104]:
X = df_preproc.drop(columns='Absenteeism')

In [107]:
# Reverse elimination
X = X.drop(['Day of the week','Daily Work Load Average','Distance to Work'],axis=1)

In [105]:
y = df_preproc['Absenteeism']

# Standardize numerical features

In [116]:
X.columns

Index(['reason 1', 'reason 2', 'reason 3', 'reason 4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month'],
      dtype='object')

In [117]:
X_unscaled = X[['Transportation Expense', 'Age',
       'Body Mass Index', 'Children',
       'Pets', 'Month']]

In [118]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_unscaled)

StandardScaler()

In [119]:
import pickle
with open('../absenteeism/scaler','wb') as file:
    pickle.dump(scaler,file)

In [120]:
with open('../absenteeism/scaler', 'rb') as scaler_file:

    abs_scaler = pickle.load(scaler_file)

In [121]:
X_scaled = abs_scaler.transform(X_unscaled)

In [122]:
X = X.drop(columns = ['Transportation Expense', 'Age',
       'Body Mass Index', 'Children',
       'Pets', 'Month'])

In [126]:
X[['Transportation Expense', 'Age',
       'Body Mass Index', 'Children',
       'Pets', 'Month']] = X_scaled
X.columns

Index(['reason 1', 'reason 2', 'reason 3', 'reason 4', 'Education',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets',
       'Month'],
      dtype='object')

# Train test split

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8)

# Logistic regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the model

In [27]:
logreg = LogisticRegression()

In [28]:
logreg.fit(X_train,y_train)

LogisticRegression()

In [29]:
logreg.score(X_train, y_train)

0.7767857142857143

## Analyze coefficients

In [30]:
logreg.intercept_

array([-1.69783724])

In [31]:
logreg.coef_

array([[ 2.79698998,  1.05680472,  3.1877269 ,  0.73203298, -0.23252329,
         0.49908463, -0.22036721,  0.13656264,  0.33571319, -0.36826842,
         0.13061851]])

In [32]:
feature_names = X.columns.values

In [33]:
summary_table = pd.DataFrame (columns = ['Feature name'], data = feature_names)
summary_table['Coefficient'] = np.transpose(logreg.coef_)

In [34]:
summary_table

Unnamed: 0,Feature name,Coefficient
0,reason 1,2.79699
1,reason 2,1.056805
2,reason 3,3.187727
3,reason 4,0.732033
4,Education,-0.232523
5,Transportation Expense,0.499085
6,Age,-0.220367
7,Body Mass Index,0.136563
8,Children,0.335713
9,Pets,-0.368268


In [35]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', logreg.intercept_[0]]

In [36]:
summary_table.sort_index(inplace = True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.697837
1,reason 1,2.79699
2,reason 2,1.056805
3,reason 3,3.187727
4,reason 4,0.732033
5,Education,-0.232523
6,Transportation Expense,0.499085
7,Age,-0.220367
8,Body Mass Index,0.136563
9,Children,0.335713


## Evaluate predictive ability

In [37]:
logreg.score(X_test,y_test)

0.8142857142857143

In [38]:
# probability of absenteeism
logreg.predict_proba(X_test)[:,1]

array([0.70931504, 0.36840851, 0.20969843, 0.27811817, 0.24445625,
       0.83131672, 0.27811817, 0.20969843, 0.55075616, 0.13751536,
       0.71734535, 0.11685597, 0.30539132, 0.27811817, 0.37712774,
       0.25140961, 0.87406229, 0.35442245, 0.5780707 , 0.2633942 ,
       0.35105371, 0.21796366, 0.79259952, 0.18222722, 0.15133057,
       0.51649505, 0.39480189, 0.27233514, 0.24349266, 0.65434932,
       0.18785062, 0.32143849, 0.83954126, 0.25140961, 0.59632862,
       0.2159457 , 0.2633942 , 0.32617975, 0.27224937, 0.34688817,
       0.18785062, 0.43928863, 0.21796366, 0.74681877, 0.18785062,
       0.21796366, 0.11492724, 0.69446973, 0.74532816, 0.95529567,
       0.36338446, 0.26376998, 0.77062376, 0.26492361, 0.71532859,
       0.8100825 , 0.7646098 , 0.24081575, 0.84933624, 0.84003734,
       0.06116952, 0.2159457 , 0.56451789, 0.19447098, 0.30539132,
       0.21796366, 0.21796366, 0.18785062, 0.18785062, 0.90624999,
       0.75577713, 0.85018817, 0.29719187, 0.84003734, 0.65304

# Save the model

In [39]:
import pickle

In [40]:
with open('../absenteeism/model','wb') as file:
    pickle.dump(logreg,file)


In [None]:
with open('../absenteeism/model','rb') as model_file, open('../absenteeism/scaler', 'rb') as scaler_file:
    model = pickle.load(model_file)