# Applying Machine Learning to the Preprocessed Data

## Creating a Logistic Regression to Predict Absenteeism

### Import the relevant libaries

In [1]:
import numpy as np
import pandas as pd

### Load the data

In [2]:
data_preprocessed = pd.read_csv("Absenteeism_preprocessed.csv")

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


=================================================================================================================================================

## Creating the Targets for the Regression

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

* `np.where(condition, value if True, value if False)` checks if a condition has been satisfied and assigns a value accordingly

In [5]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [8]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [9]:
data_preprocessed['Excessive Absenteeism'] = targets

In [10]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### A comment on the targets

In [12]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [13]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

In [14]:
data_with_targets is data_preprocessed

False

In [15]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


=================================================================================================================================================

## Selecting the Inputs for the Regression

In [16]:
data_with_targets.shape

(700, 15)

* `DataFrame.iloc[row indices, column indices]` selects (slices) data by position when given rows and columns wanted

In [18]:
data_with_targets.iloc[:, :14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [19]:
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [20]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

=================================================================================================================================================

## Standardizing the dataset for better results

In [21]:
from sklearn.preprocessing import StandardScaler

absenteeism_scaler = StandardScaler()

In [22]:
absenteeism_scaler.fit(unscaled_inputs)

In [23]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [24]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [25]:
scaled_inputs.shape

(700, 14)

=================================================================================================================================================

## Train-Test Split

### Import the relevant module

In [26]:
from sklearn.model_selection import train_test_split

### Split

In [None]:
train_test_split(scaled_inputs, targets)

[array([[ 1.73205081, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
          0.88046927,  0.26848661],
        ...,
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.44798003,
          0.88046927,  0.26848661],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
         -0.91902997, -0.58968976]]),
 array([[ 1.73205081, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -

In [28]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets)

In [29]:
print(X_train.shape, y_train.shape)

(525, 14) (525,)


In [30]:
print(X_test.shape, y_test.shape)

(175, 14) (175,)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [32]:
print(X_train.shape, y_train.shape)

(560, 14) (560,)


In [33]:
print(X_test.shape, y_test.shape)

(140, 14) (140,)


=================================================================================================================================================

## Training and Evaluating the Model

### Logistic Regression with sklearn

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

### Training the model

In [35]:
reg = LogisticRegression()

In [36]:
reg.fit(X_train, y_train)

In [37]:
reg.score(X_train, y_train)

0.7803571428571429

### Manually check the accuracy

In [38]:
model_outputs = reg.predict(X_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [39]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [40]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [41]:
np.sum((model_outputs == y_train))

437

In [42]:
model_outputs.shape[0]

560

In [43]:
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7803571428571429

=================================================================================================================================================

## Extracting the Intercept and Coefficients

In [44]:
reg.intercept_

array([-0.21102398])

In [45]:
reg.coef_

array([[ 2.06986858,  0.33028689,  1.56236468,  1.31054482,  0.02588386,
        -0.08649155,  0.72352943, -0.06072562, -0.2052754 , -0.02828267,
         0.3257168 , -0.16073453,  0.3815691 , -0.32129179]])

In [46]:
scaled_inputs.columns.values

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [47]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [48]:
feature_names = unscaled_inputs.columns.values

In [49]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_names)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.069869
1,Reason_2,0.330287
2,Reason_3,1.562365
3,Reason_4,1.310545
4,Month Value,0.025884
5,Day of the Week,-0.086492
6,Transportation Expense,0.723529
7,Distance to Work,-0.060726
8,Age,-0.205275
9,Daily Work Load Average,-0.028283


In [50]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-0.211024
1,Reason_1,2.069869
2,Reason_2,0.330287
3,Reason_3,1.562365
4,Reason_4,1.310545
5,Month Value,0.025884
6,Day of the Week,-0.086492
7,Transportation Expense,0.723529
8,Distance to Work,-0.060726
9,Age,-0.205275


=================================================================================================================================================

## Interpreting the Coefficients

In [51]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [52]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-0.211024,0.809755
1,Reason_1,2.069869,7.923782
2,Reason_2,0.330287,1.391367
3,Reason_3,1.562365,4.770088
4,Reason_4,1.310545,3.708193
5,Month Value,0.025884,1.026222
6,Day of the Week,-0.086492,0.917143
7,Transportation Expense,0.723529,2.061697
8,Distance to Work,-0.060726,0.941081
9,Age,-0.205275,0.814423


In [54]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
1,Reason_1,2.069869,7.923782
3,Reason_3,1.562365,4.770088
4,Reason_4,1.310545,3.708193
7,Transportation Expense,0.723529,2.061697
13,Children,0.381569,1.464581
2,Reason_2,0.330287,1.391367
11,Body Mass Index,0.325717,1.385023
5,Month Value,0.025884,1.026222
10,Daily Work Load Average,-0.028283,0.972114
8,Distance to Work,-0.060726,0.941081


* A feature is not particularly important:
    * if its coefficient is around `0`
    * if its odds ratio is around `1`

* A weight (coefficient) of `0` implies that no matter the feature value, we will multiply it by `0` (in the model)

* For a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio (`1` = no change)

<center>
    <table>
        <tr>
            <th>ODDS</th>
            <th>X</th>
            <th>ODDS RATIO</th>
            <th>=</th>
            <th>NEW ODDS</th>
        </tr>
        <tr>
            <td>5:1</td>
            <td>X</td>
            <td>2</td>
            <td>=</td>
            <td>10:1</td>
        </tr>
        <tr>
            <td>5:1</td>
            <td>X</td>
            <td>0.2</td>
            <td>=</td>
            <td>1:1</td>
        </tr>
        <tr>
            <td>5:1</td>
            <td>X</td>
            <td>1</td>
            <td>=</td>
            <td>5:1</td>
        </tr>
    </table>
</center>

=================================================================================================================================================

## Creating a Custom Scaler to Standardize Only Numerical Features

In [55]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [69]:
class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
    
    def fit(self, X, y = None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y = None, copy = None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [70]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [71]:
columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']

In [72]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [73]:
absenteeism_scaler.fit(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [74]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [75]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,-0.800950,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-0.800950,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.232900,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.232900,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,-0.232900,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,0.335149,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.335149,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


### Split

In [76]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  Day of the Week  \
 231         1         0         0         0     0.330204         1.471248   
 644         0         0         0         1    -1.166834         0.903199   
 481         0         0         0         1     0.030796         1.471248   
 271         0         0         0         1     0.629611         0.903199   
 668         0         1         0         0    -0.867426        -0.800950   
 ..        ...       ...       ...       ...          ...              ...   
 21          1         0         0         0    -1.166834         2.039298   
 373         0         0         0         1     0.929019        -1.368999   
 440         0         0         0         1    -0.568019        -0.232900   
 191         1         0         0         0    -0.268611         1.471248   
 250         0         0         1         0     0.030796         0.335149   
 
      Transportation Expense  Distance to Work       Age  \
 2

In [77]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets)

In [78]:
print(X_train.shape, y_train.shape)

(525, 14) (525,)


In [79]:
print(X_test.shape, y_test.shape)

(175, 14) (175,)


In [80]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [81]:
print(X_train.shape, y_train.shape)

(560, 14) (560,)


In [82]:
print(X_test.shape, y_test.shape)

(140, 14) (140,)


### Logistic Regression with sklearn

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

### Training the model

In [84]:
reg = LogisticRegression()

In [85]:
reg.fit(X_train, y_train)

In [86]:
reg.score(X_train, y_train)

0.7660714285714286

### Manually check the accuracy

In [87]:
model_outputs = reg.predict(X_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [88]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [89]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [90]:
np.sum((model_outputs == y_train))

429

In [91]:
model_outputs.shape[0]

560

In [92]:
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7660714285714286

### Extracting the Intercept and Coefficients

In [93]:
reg.intercept_

array([-1.6078758])

In [94]:
reg.coef_

array([[ 2.78557668,  0.92172133,  3.09972239,  0.81492433,  0.01372847,
        -0.07603813,  0.62697633, -0.02962732, -0.17353389, -0.02268081,
         0.28182588, -0.26881295,  0.36067552, -0.27421928]])

In [95]:
scaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [96]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [97]:
feature_names = unscaled_inputs.columns.values

In [98]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_names)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.785577
1,Reason_2,0.921721
2,Reason_3,3.099722
3,Reason_4,0.814924
4,Month Value,0.013728
5,Day of the Week,-0.076038
6,Transportation Expense,0.626976
7,Distance to Work,-0.029627
8,Age,-0.173534
9,Daily Work Load Average,-0.022681


In [99]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.607876
1,Reason_1,2.785577
2,Reason_2,0.921721
3,Reason_3,3.099722
4,Reason_4,0.814924
5,Month Value,0.013728
6,Day of the Week,-0.076038
7,Transportation Expense,0.626976
8,Distance to Work,-0.029627
9,Age,-0.173534


### Interpreting the Coefficients

In [100]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [101]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-1.607876,0.200313
1,Reason_1,2.785577,16.209163
2,Reason_2,0.921721,2.513613
3,Reason_3,3.099722,22.19179
4,Reason_4,0.814924,2.259005
5,Month Value,0.013728,1.013823
6,Day of the Week,-0.076038,0.926781
7,Transportation Expense,0.626976,1.871942
8,Distance to Work,-0.029627,0.970807
9,Age,-0.173534,0.840689


In [102]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
3,Reason_3,3.099722,22.19179
1,Reason_1,2.785577,16.209163
2,Reason_2,0.921721,2.513613
4,Reason_4,0.814924,2.259005
7,Transportation Expense,0.626976,1.871942
13,Children,0.360676,1.434298
11,Body Mass Index,0.281826,1.325548
5,Month Value,0.013728,1.013823
10,Daily Work Load Average,-0.022681,0.977574
8,Distance to Work,-0.029627,0.970807


=================================================================================================================================================

## Simplifying the Model (Backward Elimination)

### Backward Elimination

* The idea is that we can simplify our model by removing all features which have close to no contribution to the model.

* When we have the p-values, we get rid of all coefficients with `p-values > 0.05`.

In [103]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week', 'Daily Work Load Average', 'Distance to Work'], axis=1)

In [104]:
data_with_targets is data_preprocessed

False

In [105]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


### Selecting the Inputs for the Regression

In [106]:
data_with_targets.shape

(700, 12)

In [107]:
data_with_targets.iloc[:, :14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [108]:
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [109]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [110]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [111]:
columns_to_scale = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Transportation Expense', 'Age', 'Body Mass Index', 'Education', 'Children', 'Pet']

In [112]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [113]:
absenteeism_scaler.fit(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [114]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [115]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,-0.577350,-0.092981,-0.314485,0.821365,0.030796,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,0.030796,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,0.030796,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.030796,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,0.030796,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,-0.314485,-1.217485,-0.568019,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1.732051,-0.092981,-0.314485,-1.217485,-0.568019,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1.732051,-0.092981,-0.314485,-1.217485,-0.568019,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,-0.577350,-0.092981,-0.314485,0.821365,-0.568019,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


### Split

In [116]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 89   1.732051 -0.092981 -0.314485 -1.217485     1.228426   
 516 -0.577350 -0.092981 -0.314485  0.821365     0.929019   
 27  -0.577350 -0.092981 -0.314485  0.821365     1.527833   
 566 -0.577350 -0.092981 -0.314485  0.821365     1.527833   
 236 -0.577350 -0.092981  3.179797 -1.217485    -0.568019   
 ..        ...       ...       ...       ...          ...   
 568 -0.577350 -0.092981 -0.314485  0.821365     1.527833   
 563 -0.577350 -0.092981 -0.314485  0.821365     1.527833   
 411 -0.577350 -0.092981 -0.314485  0.821365    -1.166834   
 350 -0.577350 -0.092981 -0.314485  0.821365     1.527833   
 402 -0.577350 -0.092981 -0.314485  0.821365    -1.166834   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 89                -1.016322 -0.379188        -0.408580  -0.447980  0.880469   
 516                2.213108 -0.849811        -0.408580  -0.447980  1.780219   
 27                -0.9861

In [117]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets)

In [118]:
print(X_train.shape, y_train.shape)

(525, 11) (525,)


In [119]:
print(X_test.shape, y_test.shape)

(175, 11) (175,)


In [120]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [121]:
print(X_train.shape, y_train.shape)

(560, 11) (560,)


In [122]:
print(X_test.shape, y_test.shape)

(140, 11) (140,)


### Logistic Regression with sklearn

In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

### Training the model

In [124]:
reg = LogisticRegression()

In [125]:
reg.fit(X_train, y_train)

In [126]:
reg.score(X_train, y_train)

0.7857142857142857

### Manually check the accuracy

In [127]:
model_outputs = reg.predict(X_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [128]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [129]:
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [130]:
np.sum((model_outputs == y_train))

440

In [131]:
model_outputs.shape[0]

560

In [132]:
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7857142857142857

### Extracting the Intercept and Coefficients

In [133]:
reg.intercept_

array([-0.21805902])

In [134]:
reg.coef_

array([[ 2.05375976,  0.32940745,  1.55175598,  1.295456  ,  0.01975052,
         0.71290575, -0.20310607,  0.33288059, -0.1389948 ,  0.37910854,
        -0.31894076]])

In [135]:
scaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [136]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [137]:
feature_names = unscaled_inputs.columns.values

In [138]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_names)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.05376
1,Reason_2,0.329407
2,Reason_3,1.551756
3,Reason_4,1.295456
4,Month Value,0.019751
5,Transportation Expense,0.712906
6,Age,-0.203106
7,Body Mass Index,0.332881
8,Education,-0.138995
9,Children,0.379109


In [139]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-0.218059
1,Reason_1,2.05376
2,Reason_2,0.329407
3,Reason_3,1.551756
4,Reason_4,1.295456
5,Month Value,0.019751
6,Transportation Expense,0.712906
7,Age,-0.203106
8,Body Mass Index,0.332881
9,Education,-0.138995


### Interpreting the Coefficients

In [140]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [141]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-0.218059,0.804078
1,Reason_1,2.05376,7.797162
2,Reason_2,0.329407,1.390144
3,Reason_3,1.551756,4.719751
4,Reason_4,1.295456,3.652661
5,Month Value,0.019751,1.019947
6,Transportation Expense,0.712906,2.03991
7,Age,-0.203106,0.816192
8,Body Mass Index,0.332881,1.394981
9,Education,-0.138995,0.870233


In [142]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
1,Reason_1,2.05376,7.797162
3,Reason_3,1.551756,4.719751
4,Reason_4,1.295456,3.652661
6,Transportation Expense,0.712906,2.03991
10,Children,0.379109,1.460982
8,Body Mass Index,0.332881,1.394981
2,Reason_2,0.329407,1.390144
5,Month Value,0.019751,1.019947
9,Education,-0.138995,0.870233
7,Age,-0.203106,0.816192


=================================================================================================================================================

## Testing the Logistic Regression Model

In [143]:
reg.score(X_test, y_test)

0.7285714285714285

* `sklearn.linear_model.LogisticRegression.predict_proba(x)` returns the probability estimates for all possible outputs (classes)

In [144]:
predicted_proba = reg.predict_proba(X_test)
predicted_proba

array([[0.75419772, 0.24580228],
       [0.59585028, 0.40414972],
       [0.4459363 , 0.5540637 ],
       [0.76179054, 0.23820946],
       [0.06685395, 0.93314605],
       [0.28106341, 0.71893659],
       [0.29012732, 0.70987268],
       [0.07130711, 0.92869289],
       [0.7464439 , 0.2535561 ],
       [0.75747164, 0.24252836],
       [0.47933931, 0.52066069],
       [0.15452665, 0.84547335],
       [0.03565258, 0.96434742],
       [0.72907287, 0.27092713],
       [0.22721519, 0.77278481],
       [0.50950968, 0.49049032],
       [0.47786365, 0.52213635],
       [0.48524527, 0.51475473],
       [0.36570263, 0.63429737],
       [0.0340704 , 0.9659296 ],
       [0.74194102, 0.25805898],
       [0.76179054, 0.23820946],
       [0.47413086, 0.52586914],
       [0.46823712, 0.53176288],
       [0.15662496, 0.84337504],
       [0.74756148, 0.25243852],
       [0.49018941, 0.50981059],
       [0.8993592 , 0.1006408 ],
       [0.16217057, 0.83782943],
       [0.76179054, 0.23820946],
       [0.

In [145]:
predicted_proba.shape

(140, 2)

In [None]:
# Probabilities of Absenteeism

predicted_proba[:, 1]

array([0.24580228, 0.40414972, 0.5540637 , 0.23820946, 0.93314605,
       0.71893659, 0.70987268, 0.92869289, 0.2535561 , 0.24252836,
       0.52066069, 0.84547335, 0.96434742, 0.27092713, 0.77278481,
       0.49049032, 0.52213635, 0.51475473, 0.63429737, 0.9659296 ,
       0.25805898, 0.23820946, 0.52586914, 0.53176288, 0.84337504,
       0.25243852, 0.50981059, 0.1006408 , 0.83782943, 0.23820946,
       0.39846673, 0.73072911, 0.72722468, 0.52361162, 0.23820946,
       0.63729175, 0.25467694, 0.84709916, 0.45542601, 0.63049413,
       0.23713803, 0.47201318, 0.25021316, 0.10842763, 0.83589641,
       0.68630226, 0.73514437, 0.24036228, 0.2455922 , 0.23606993,
       0.48381645, 0.06564283, 0.71893659, 0.24015537, 0.847291  ,
       0.40700098, 0.94692247, 0.25223161, 0.08047104, 0.08090969,
       0.71143335, 0.72250736, 0.25559231, 0.84724115, 0.23446034,
       0.24470767, 0.01192714, 0.25580104, 0.83847784, 0.2756246 ,
       0.24800092, 0.07771159, 0.91090465, 0.45249436, 0.63180

* In reality, Logistic Regression models calculate these probabilities in the background.

* If the probability is:
    * below `0.5`, it places as `0`
    * above `0.5`, it places as `1`

=================================================================================================================================================

## Saving the Logistic Regression Model

* `pickle` is a Python module used to convert a Python object into a character stream

In [147]:
import pickle

In [148]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [149]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)

<hr>