In [131]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [132]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [133]:
data_preprocessed.head(5)

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month_Value,Weekday_Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [134]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [135]:
# Binary target: 1 when the absence hours are strictly above the column median, 0 otherwise.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)
# Store the labels alongside the features (adds a column to the loaded frame in place).
data_preprocessed['Excessive Absenteism'] = targets

In [136]:
data_preprocessed

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month_Value,Weekday_Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


In [137]:
targets.sum() /  targets.shape[0]

0.45571428571428574

In [138]:
# The raw hours column is what the target was derived from, so it is removed from the frame.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

### Inputs

In [139]:
data_with_targets.shape

(700, 15)

In [140]:
# Keep every column except the last one. The slice is positional, so it relies on
# the target column ('Excessive Absenteism') being the final column of the frame.
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [141]:
# NOTE(review): this plain StandardScaler instance is not referenced by any of the
# cells visible here; the scaling is done by the CustomScaler defined below.
absenteism_scaler = StandardScaler()
# Earlier approach that fitted the scaler on every input column:
# absenteism_scaler.fit(unscaled_inputs)
# scaled_inputs = absenteism_scaler.transform(unscaled_inputs)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on, and applied to, only the columns listed
    in ``columns``. All other columns are returned unchanged, and the output
    keeps the column order and the row index of the input frame.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean learned by ``fit``, indexed by column label
        (``None`` before fitting).
    var_ : pandas.Series or None
        Per-column variance learned by ``fit``, indexed by column label
        (``None`` before fitting).
    """

    def __init__(self, columns):
        # The actual scaling is delegated to a regular StandardScaler ...
        self.scaler = StandardScaler()
        # ... which is restricted to these columns.
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the mean and variance of the selected columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns in ``self.columns``.
        y : ignored
            Forwarded to the wrapped scaler for API compatibility.

        Returns
        -------
        CustomScaler
            The fitted instance (``self``).
        """
        self.scaler.fit(X[self.columns], y)
        # Take the statistics from the fitted scaler rather than calling
        # np.mean / np.var on a DataFrame: depending on the pandas version
        # those calls may reduce over both axes or emit a deprecation warning.
        self.mean_ = pd.Series(self.scaler.mean_, index=self.columns)
        self.var_ = pd.Series(self.scaler.var_, index=self.columns)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least the columns in ``self.columns``.
        y, copy : ignored
            Accepted for API compatibility only.

        Returns
        -------
        pandas.DataFrame
            Same columns (in the same order) and same index as ``X``.
        """
        # Remember the incoming column order so it can be restored at the end.
        init_col_order = X.columns

        # Scale the selected columns. The result must carry X's own index:
        # with a default RangeIndex the concat below would align rows by label
        # and produce NaNs / misplaced rows for any frame whose index is not
        # 0..n-1 (for example a shuffled or filtered subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling is passed through as is.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Recombine both parts and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [142]:
# Dummy / categorical columns that must stay unscaled.
columns_to_omit = ['reason_1', 'reason_2', 'reason_3', 'reason_4', 'Education']

# Every remaining input column, in its original order, gets standardized.
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

columns_to_scale

['Month_Value',
 'Weekday_Value',
 'Transportation Expense',
 'Distance to Work',
 'Age',
 'Daily Work Load Average',
 'Body Mass Index',
 'Children',
 'Pets']

In [143]:
absenteeism_scaler = CustomScaler(columns_to_scale)


In [144]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month_Value', 'Weekday_Value', 'Transportation Expense',
                      'Distance to Work', 'Age', 'Daily Work Load Average',
                      'Body Mass Index', 'Children', 'Pets'])

In [145]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,Month_Value,Weekday_Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [146]:
scaled_inputs.shape

(700, 14)

### Split data and shuffle

In [147]:
# Hold out 20% of the rows for testing (train_size=0.8); the fixed random_state
# makes the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [148]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(560, 14) (140, 14) (560,) (140,)


In [149]:
reg = LogisticRegression()

In [150]:
reg.fit(x_train, y_train)

LogisticRegression()

In [151]:
#accuracy
reg.score(x_train, y_train)

0.775

### Manually check the accuracy

In [152]:
# Predicted class for every training row.
model_output = reg.predict(x_train)

# Accuracy by hand: number of matching predictions divided by the number of rows.
(model_output == y_train).sum() / y_train.shape[0]


0.775

### Intercept and Coefficients

In [153]:
reg.intercept_

array([-1.6561092])

In [154]:
reg.coef_

array([[ 2.80096498e+00,  9.34857518e-01,  3.09561645e+00,
         8.56587468e-01,  1.66248119e-01, -8.43703301e-02,
         6.12732578e-01, -7.79685996e-03, -1.65922708e-01,
        -1.47005122e-04,  2.71811477e-01, -2.05738037e-01,
         3.61989880e-01, -2.85510745e-01]])

In [155]:
# Column labels of the model inputs, in the order the model was trained on.
feature_name = unscaled_inputs.columns.values

# One row per feature; reg.coef_ has shape (1, n_features), so it is transposed
# into a column. The 'Coefficien' label is read by later cells and is kept as is.
summary_table = pd.DataFrame({'feature_name': feature_name})
summary_table['Coefficien'] = reg.coef_.T

summary_table

Unnamed: 0,feature_name,Coefficien
0,reason_1,2.800965
1,reason_2,0.934858
2,reason_3,3.095616
3,reason_4,0.856587
4,Month_Value,0.166248
5,Weekday_Value,-0.08437
6,Transportation Expense,0.612733
7,Distance to Work,-0.007797
8,Age,-0.165923
9,Daily Work Load Average,-0.000147


In [156]:
# Put the intercept in row 0 ahead of the feature coefficients.
# The guard makes the cell safe to re-run: without it every execution would
# shift the index by one more position and append another "Intercept" row.
if "Intercept" not in summary_table['feature_name'].values:
    # Free up index 0 by moving every feature row down by one.
    summary_table.index = summary_table.index + 1
    summary_table.loc[0] = ["Intercept", reg.intercept_[0]]

    # .loc[0] appended the row at the end; sorting by index moves it to the top.
    summary_table = summary_table.sort_index()

In [157]:
# Exponentiate every value in the 'Coefficien' column (intercept row included)
# and store the result as 'Odds_ratio'.
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficien'])
# Display the rows ordered from the largest to the smallest odds ratio;
# the sorted frame is only shown, summary_table itself is not reordered.
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,feature_name,Coefficien,Odds_ratio
3,reason_3,3.095616,22.100858
1,reason_1,2.800965,16.460523
2,reason_2,0.934858,2.546851
4,reason_4,0.856587,2.35511
7,Transportation Expense,0.612733,1.845467
13,Children,0.36199,1.436184
11,Body Mass Index,0.271811,1.31234
5,Month_Value,0.166248,1.180866
10,Daily Work Load Average,-0.000147,0.999853
8,Distance to Work,-0.007797,0.992233
