Machine learning analysis for the preprocessed absenteeism data

In [1]:
import pandas as pd
import numpy as np

df_prep = pd.read_csv("Resources\Absenteeism_preprocessed_data.csv")

In [2]:
df_prep.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [3]:
df_prep['Absenteeism Time in Hours'].median()

3.0

In [4]:
targets = np.where(df_prep['Absenteeism Time in Hours'] > df_prep['Absenteeism Time in Hours'].median(), 1, 0)

In [5]:
print(len(targets))
sum(targets)

700


319

In [6]:
df = df_prep.drop(['Absenteeism Time in Hours'], axis = 1)

In [7]:
df['Excessive Absenteeism'] = targets
df.head()

Unnamed: 0.1,Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [8]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [9]:
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [10]:
df_with_targets = df.copy()

In [11]:
df_with_targets.shape

(700, 15)

In [12]:
unscaled_inputs = df_with_targets.iloc[:,:-1]

In [13]:
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


### Standardization

In [14]:
from sklearn.preprocessing import StandardScaler
absenteism_scaler = StandardScaler()

In [15]:
absenteism_scaler.fit(unscaled_inputs)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
scaled_inputs = absenteism_scaler.transform(unscaled_inputs)

  """Entry point for launching an IPython kernel.


In [17]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state=42, shuffle = True)



In [20]:
len(X_train)

560

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [22]:
reg = LogisticRegression()

In [23]:
reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
reg.score(X_train, y_train)

0.7732142857142857

In [25]:
predictions = reg.predict(X_test)

In [26]:
reg.score(X_test, y_test)

0.7571428571428571

In [27]:
predictions == y_test

array([False, False, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True, False,  True,  True, False,
        True, False, False,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False, False, False,
        True, False,  True, False,  True,  True, False, False, False,
        True, False,

In [28]:
np.sum(predictions==y_test)

106

In [29]:
accuracy = np.sum(predictions==y_test) / len(predictions)
accuracy

0.7571428571428571

In [30]:
reg.intercept_

array([-0.17869474])

In [31]:
reg.coef_

array([[ 2.13760855,  0.34685155,  1.537381  ,  1.41116497,  0.09469695,
        -0.15167141,  0.79485964, -0.09792415, -0.28721165, -0.01117007,
         0.28412025, -0.14755158,  0.42284937, -0.36515323]])

In [32]:
feature_names = unscaled_inputs.columns.values

In [33]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_names)
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficients
0,Reason_1,2.137609
1,Reason_2,0.346852
2,Reason_3,1.537381
3,Reason_4,1.411165
4,Month,0.094697
5,Day of the week,-0.151671
6,Transportation Expense,0.79486
7,Distance to Work,-0.097924
8,Age,-0.287212
9,Daily Work Load Average,-0.01117


In [34]:
unscaled_inputs.corr()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
Reason_1,1.0,-0.053683,-0.181568,-0.702915,-0.002826,-0.078055,0.008725,-0.098625,0.022772,0.165434,-0.077597,0.095056,-0.061594,-0.054502
Reason_2,-0.053683,1.0,-0.029241,-0.113203,-0.018406,-0.011194,-0.000487,-0.022372,0.030381,-0.016664,-0.078084,0.124487,0.081867,-0.041531
Reason_3,-0.181568,-0.029241,1.0,-0.382881,-0.013796,-0.103661,0.016658,-0.023367,-0.019797,0.014542,-0.038087,0.046426,0.088256,0.105854
Reason_4,-0.702915,-0.113203,-0.382881,1.0,-0.036989,0.110696,-0.066727,0.130687,-0.062765,-0.163826,0.07116,-0.108243,-0.04968,-0.013062
Month,-0.002826,-0.018406,-0.013796,-0.036989,1.0,0.071718,0.135803,-0.00397,0.004983,-0.174029,0.058254,-0.075561,0.082057,0.07625
Day of the week,-0.078055,-0.011194,-0.103661,0.110696,0.071718,1.0,0.048516,0.089484,0.004039,0.012573,-0.07175,0.037951,0.111936,-0.045163
Transportation Expense,0.008725,-0.000487,0.016658,-0.066727,0.135803,0.048516,1.0,0.23494,-0.223828,0.006123,-0.140531,-0.019623,0.381749,0.446887
Distance to Work,-0.098625,-0.022372,-0.023367,0.130687,-0.00397,0.089484,0.23494,1.0,-0.131076,-0.073683,0.13619,-0.289994,0.048534,0.171585
Age,0.022772,0.030381,-0.019797,-0.062765,0.004983,0.004039,-0.223828,-0.131076,1.0,-0.045452,0.483762,-0.219723,0.04693,-0.252067
Daily Work Load Average,0.165434,-0.016664,0.014542,-0.163826,-0.174029,0.012573,0.006123,-0.073683,-0.045452,1.0,-0.09843,-0.05899,0.032194,0.01049


In [35]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table

In [36]:
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-0.178695
1,Reason_1,2.137609
2,Reason_2,0.346852
3,Reason_3,1.537381
4,Reason_4,1.411165
5,Month,0.094697
6,Day of the week,-0.151671
7,Transportation Expense,0.79486
8,Distance to Work,-0.097924
9,Age,-0.287212


### Interpreting the coefficients ###

In [37]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficients)

In [38]:
summary_table

Unnamed: 0,Feature name,Coefficients,Odds_ratio
0,Intercept,-0.178695,0.836361
1,Reason_1,2.137609,8.479136
2,Reason_2,0.346852,1.414607
3,Reason_3,1.537381,4.65239
4,Reason_4,1.411165,4.10073
5,Month,0.094697,1.099326
6,Day of the week,-0.151671,0.859271
7,Transportation Expense,0.79486,2.21413
8,Distance to Work,-0.097924,0.906718
9,Age,-0.287212,0.750353


In [39]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficients,Odds_ratio
1,Reason_1,2.137609,8.479136
3,Reason_3,1.537381,4.65239
4,Reason_4,1.411165,4.10073
7,Transportation Expense,0.79486,2.21413
13,Children,0.422849,1.526304
2,Reason_2,0.346852,1.414607
11,Body Mass Index,0.28412,1.328593
5,Month,0.094697,1.099326
10,Daily Work Load Average,-0.01117,0.988892
8,Distance to Work,-0.097924,0.906718


In [50]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns = self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

In [42]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [43]:
columns_to_scale = ['Month',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']

In [51]:
absenteism_scaler = CustomScaler(columns_to_scale)
absenteism_scaler.fit(unscaled_inputs)
scaled_inputs = absenteism_scaler.transform(unscaled_inputs)
scaled_inputs

  return self.partial_fit(X, y)


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
5,0,0,0,1,0.182726,1.344231,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
6,0,0,0,1,0.182726,1.344231,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,-0.019280,2.843016
7,0,0,0,1,0.182726,1.344231,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,2.679969,-0.589690
8,0,0,1,0,0.182726,-1.359682,-1.016322,-1.209478,-0.379188,-0.806331,-0.408580,0,0.880469,-0.589690
9,0,0,0,1,0.182726,-1.359682,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,-0.019280,0.268487


In [53]:
reg.fit(X_train, y_train)
print('Training score: ',reg.score(X_train, y_train))
predictions = reg.predict(X_test)
print('Testing score:', reg.score(X_test, y_test))
accuracy = np.sum(predictions==y_test) / len(predictions)
print( 'Accuracy:', accuracy)
print('Intercept:', reg.intercept_)
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_names)
summary_table['Coefficients'] = np.transpose(reg.coef_)
summary_table

Training score:  0.7732142857142857
Testing score: 0.7571428571428571
Accuracy: 0.7571428571428571
Intercept: [-0.17869474]




Unnamed: 0,Feature name,Coefficients
0,Reason_1,2.137609
1,Reason_2,0.346852
2,Reason_3,1.537381
3,Reason_4,1.411165
4,Month,0.094697
5,Day of the week,-0.151671
6,Transportation Expense,0.79486
7,Distance to Work,-0.097924
8,Age,-0.287212
9,Daily Work Load Average,-0.01117


In [54]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficients)
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficients,Odds_ratio
0,Reason_1,2.137609,8.479136
2,Reason_3,1.537381,4.65239
3,Reason_4,1.411165,4.10073
6,Transportation Expense,0.79486,2.21413
12,Children,0.422849,1.526304
1,Reason_2,0.346852,1.414607
10,Body Mass Index,0.28412,1.328593
4,Month,0.094697,1.099326
9,Daily Work Load Average,-0.01117,0.988892
7,Distance to Work,-0.097924,0.906718
