In [1]:
import pandas as pd
import numpy as np

# Load the data

In [2]:
df_preproc = pd.read_csv('../absenteeism/data/Preprocessed_absenteeism_data.csv')

In [3]:
df_preproc.head()

Unnamed: 0,reason 1,reason 2,reason 3,reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3


# Target definition

In [4]:
df_preproc['Absenteeism Time in Hours'].median()

3.0

In [5]:
df_preproc['Absenteeism'] = np.where(df_preproc['Absenteeism Time in Hours'] > df_preproc['Absenteeism Time in Hours'].median(), 1, 0)

In [6]:
df_preproc['Absenteeism'].sum()/df_preproc.shape[0]

0.45571428571428574

In [7]:
df_preproc.drop(columns ='Absenteeism Time in Hours', inplace = True )

In [8]:
df_preproc

Unnamed: 0,reason 1,reason 2,reason 3,reason 4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month,Day of the week,Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2,1
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2,0
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3,1
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3,0


# Feature and target selection

In [9]:
df_preproc.shape

(700, 15)

In [10]:
X = df_preproc.drop(columns='Absenteeism')

In [11]:
# Reverse elimination
X = X.drop(['Day of the week','Daily Work Load Average','Distance to Work'],axis=1)

In [12]:
X.columns

Index(['reason 1', 'reason 2', 'reason 3', 'reason 4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month'],
      dtype='object')

In [13]:
X.head()

Unnamed: 0,reason 1,reason 2,reason 3,reason 4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7


In [14]:
y = df_preproc['Absenteeism']

# Standardize numerical features

In [15]:
X.columns

Index(['reason 1', 'reason 2', 'reason 3', 'reason 4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month'],
      dtype='object')

In [16]:
X_unscaled = X[['Transportation Expense', 'Age',
       'Body Mass Index', 'Children',
       'Pets', 'Month']]

In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_unscaled)

StandardScaler()

In [18]:
import pickle
with open('../absenteeism/scaler','wb') as file:
    pickle.dump(scaler,file)

In [19]:
with open('../absenteeism/scaler', 'rb') as scaler_file:

    abs_scaler = pickle.load(scaler_file)

In [20]:
X_scaled = abs_scaler.transform(X_unscaled)

In [21]:
X = X.drop(columns = ['Transportation Expense', 'Age',
       'Body Mass Index', 'Children',
       'Pets', 'Month'])

In [22]:
X[['Transportation Expense', 'Age',
       'Body Mass Index', 'Children',
       'Pets', 'Month']] = X_scaled
X.columns

Index(['reason 1', 'reason 2', 'reason 3', 'reason 4', 'Education',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets',
       'Month'],
      dtype='object')

# Train test split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8)

# Logistic regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the model

In [26]:
logreg = LogisticRegression()

In [27]:
logreg.fit(X_train,y_train)

LogisticRegression()

In [28]:
logreg.score(X_train, y_train)

0.7767857142857143

## Analyze coefficients

In [29]:
logreg.intercept_

array([-1.59430706])

In [30]:
logreg.coef_

array([[ 2.66327249,  1.28303056,  3.0025583 ,  0.65192086, -0.1940472 ,
         0.56924785, -0.02678926,  0.16885538,  0.33574754, -0.3949451 ,
         0.14805927]])

In [31]:
feature_names = X.columns.values

In [32]:
summary_table = pd.DataFrame (columns = ['Feature name'], data = feature_names)
summary_table['Coefficient'] = np.transpose(logreg.coef_)

In [33]:
summary_table

Unnamed: 0,Feature name,Coefficient
0,reason 1,2.663272
1,reason 2,1.283031
2,reason 3,3.002558
3,reason 4,0.651921
4,Education,-0.194047
5,Transportation Expense,0.569248
6,Age,-0.026789
7,Body Mass Index,0.168855
8,Children,0.335748
9,Pets,-0.394945


In [34]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', logreg.intercept_[0]]

In [35]:
summary_table.sort_index(inplace = True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.594307
1,reason 1,2.663272
2,reason 2,1.283031
3,reason 3,3.002558
4,reason 4,0.651921
5,Education,-0.194047
6,Transportation Expense,0.569248
7,Age,-0.026789
8,Body Mass Index,0.168855
9,Children,0.335748


## Evaluate predictive ability

In [36]:
logreg.score(X_test,y_test)

0.8357142857142857

In [37]:
# probability of absenteeism
logreg.predict_proba(X_test)[:,1]

array([0.20265059, 0.09670133, 0.54281343, 0.23041724, 0.83292589,
       0.12297591, 0.11502066, 0.83656988, 0.2627681 , 0.19590586,
       0.19964227, 0.45716629, 0.91724519, 0.49156118, 0.76713246,
       0.91557403, 0.63679328, 0.22300683, 0.45994212, 0.7653316 ,
       0.69382629, 0.83581842, 0.86969713, 0.2950572 , 0.79525139,
       0.2627681 , 0.23134713, 0.60466555, 0.83093813, 0.45553697,
       0.56643579, 0.95502608, 0.56445788, 0.18647546, 0.13708944,
       0.82388497, 0.90155177, 0.41264407, 0.30695058, 0.43627331,
       0.84780258, 0.40243776, 0.44669626, 0.39634938, 0.16406124,
       0.31745953, 0.70117594, 0.5011333 , 0.82831914, 0.10974857,
       0.19590586, 0.75949581, 0.11939413, 0.20265059, 0.20294393,
       0.20294393, 0.19590586, 0.29609791, 0.19590586, 0.64548917,
       0.57455007, 0.90675202, 0.16988983, 0.19508227, 0.10974857,
       0.49156118, 0.23041724, 0.20265059, 0.12760824, 0.72705286,
       0.43627331, 0.87991657, 0.31502861, 0.22946663, 0.32564

# Save the model

In [38]:
import pickle

In [39]:
with open('../absenteeism/model','wb') as file:
    pickle.dump(logreg,file)


In [40]:
with open('../absenteeism/model','rb') as model_file, open('../absenteeism/scaler', 'rb') as scaler_file:
    model = pickle.load(model_file)