# Create a Logistic Regression to predict absenteeism

## 1. Import the relevant libraries

In [241]:
import numpy as np
import pandas as pd

## 2. Load the data

In [242]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [243]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Montth Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,1,2,0,8,5,2
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,0,1,2,3,5,2
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,1,0,0,8,5,3
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,1,0,0,2,5,3


## 3. Create the Target

In [244]:
data_preprocessed['Absenteeism Time in Hours'].median()

# The median is the cut-off used to split the observations into two classes:
#   Moderately absent  (<= 3 hours)
#   Excessively absent (>= 4 hours)

3.0

In [245]:
# Binary target: 1 when the hours absent are strictly above the median
# (excessive absenteeism), otherwise 0.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [246]:
data_preprocessed['Excessive Absenteeism'] = targets

In [247]:
data_preprocessed.head(4)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Montth Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1


In [248]:
### A comment on the targets

In [249]:
targets.sum() / targets.shape[0]

# Share of observations labelled 1: around 46% of the targets are 1s and 54% are 0s.
# A balance of 45-55 is almost always sufficient.

0.45571428571428574

In [250]:
# The raw hours column is no longer needed once the target has been derived
# from it, and the Date column is dropped as well. drop() returns a new frame,
# so data_preprocessed itself is left untouched.
data_with_targets = data_preprocessed.drop(
    columns=['Absenteeism Time in Hours', 'Date']
)



In [251]:
data_with_targets is data_preprocessed

False

In [252]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Montth Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0


## 4. Select the inputs for the regression

In [253]:
data_with_targets.shape

(700, 15)

In [254]:
data_with_targets.iloc[:, 0:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Montth Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [255]:
# Or
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Montth Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [256]:
# Inputs are every column except the last one (the 'Excessive Absenteeism' target).
# 'Unnamed: 0' is only the row counter carried over from the CSV, not a real
# predictor, so it is removed as well; errors='ignore' keeps the cell working
# when that column is absent.
unscaled_inputs = (
    data_with_targets
    .iloc[:, :-1]
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

## 5. Standardize the data

In [257]:
from sklearn.preprocessing import StandardScaler

In [258]:
# Learn the per-column mean and standard deviation used for standardization.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test-set statistics influence the scaling — consider fitting
# on the training rows only.
absenteeism_scaler = StandardScaler()
absenteeism_scaler.fit(unscaled_inputs) 

In [259]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [260]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.68370352],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         0.18272635, -0.00772546],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
        -0.3882935 ,  0.66825259],
       [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
        -0.3882935 ,  0.66825259]])

In [261]:
scaled_inputs.shape

(700, 14)

## 6. Splitting the Data for Training and Testing

In [262]:
# Split the data into train & test and shuffle

from sklearn.model_selection import train_test_split

In [263]:
# Split
train_test_split(scaled_inputs, targets)

[array([[-0.57735027, -0.09298136,  3.17979734, ..., -0.58968976,
         -0.3882935 , -0.68370352],
        [ 1.73205081, -0.09298136, -0.31448545, ...,  0.26848661,
         -0.3882935 , -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
          1.32476605, -0.00772546],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          1.03925612,  2.0202087 ],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
          1.03925612,  2.0202087 ],
        [-0.57735027, -0.09298136, -0.31448545, ...,  1.12666297,
          1.61027597,  1.34423065]]),
 array([[-0.57735027, -0.09298136, -0.31448545, ...,  0.26848661,
          0.46823627,  0.66825259],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.67380342, -0.00772546],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.58968976,
         -0.95931334,  0.66825259],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -

In [264]:
# Hold out 20% of the rows for testing (train_size=0.8 keeps 80% for training).
# The rows are shuffled before splitting; fixing random_state makes that shuffle,
# and therefore every score computed from these splits, reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs, targets, train_size=0.8, random_state=42
)

In [265]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [266]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


In [267]:
# Report the shape of each split, one per line.
for split_name, split in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(" " + split_name + ": ", split.shape)

 x_train:  (560, 14)
 y_train:  (560,)
 x_test:  (140, 14)
 y_test:  (140,)


# Machine Learning Models

## 1. Logistic Regression

In [268]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### 1.1 Training the logistic model

In [269]:
reg = LogisticRegression()

In [270]:
reg.fit(x_train, y_train)

In [271]:
reg.score(x_train, y_train)

0.7660714285714286

### 1.2 Manually check the accuracy

In [272]:
model_outputs = reg.predict(x_train)
model_outputs
# These are the classes the model predicts for the training inputs (its outputs)

array([1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [273]:
y_train
# These are the actual targets from the data, to compare against the predictions

array([1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,

In [274]:
model_outputs == y_train
# True = 1
# False = 0

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True, False,
        True, False,  True,  True, False,  True, False,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True, False,  True, False,  True, False,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True,  True,  True,  True,
        True, False,  True, False,  True, False,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True, False, False, False, False,  True, False,  True,
        True,  True,

In [275]:
np.sum(model_outputs==y_train)
# is the total number of correct predictions (True entries)

429

In [276]:
# Observations
model_outputs.shape[0]

560

In [277]:
# Accuracy = number of correct predictions / number of observations
Accuracy = np.sum(model_outputs==y_train) / model_outputs.shape[0]

In [278]:
Accuracy

0.7660714285714286

## 2. Logistic Regression with Hyperparameter Optimization

In [279]:
import joblib
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [280]:
def print_results(results):
    """Print the best parameters and the cross-validated score of every candidate.

    `results` must expose `best_params_` and `cv_results_` (with the
    'mean_test_score', 'std_test_score' and 'params' entries). One line is
    printed per candidate: its mean test score and twice the standard
    deviation of that score, both rounded to 3 decimals.
    """
    print(f"BEST PARAMS: {results.best_params_}\n")

    cv_results = results.cv_results_
    candidates = zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
    for avg_score, score_std, candidate in candidates:
        print(f"{round(avg_score, 3)} (+/--{round(score_std * 2, 3)}) for {candidate}")

In [281]:
# Base estimator whose regularization strength is tuned below
lr = LogisticRegression()

# Candidate values for C, the inverse of the regularization strength
# (smaller C means stronger regularization)
parameters = {
    'C':[0.001, 0.01, 0.1, 1, 10]
}

# Grid search with 5-fold cross-validation over the candidates above
cv = GridSearchCV(lr, parameters, cv=5)

# Fit on the training split only; the test split is not used here
cv.fit(x_train, y_train)

# Report the best C and the mean score with twice its std for every candidate
print_results(cv)

BEST PARAMS: {'C': 1}

0.627 (+/--0.058) for {'C': 0.001}
0.711 (+/--0.07) for {'C': 0.01}
0.736 (+/--0.075) for {'C': 0.1}
0.75 (+/--0.102) for {'C': 1}
0.746 (+/--0.102) for {'C': 10}


In [282]:
# Check the best parameter
cv.best_estimator_

In [283]:
# Write out the picked model: persist the grid search's best estimator to disk.
# NOTE(review): only the model is saved here; it was trained on inputs
# standardized with absenteeism_scaler, so anything loading LR_model.pkl
# presumably needs that scaler too — confirm it is persisted somewhere.
joblib.dump(cv.best_estimator_, 'LR_model.pkl')

['LR_model.pkl']