In [1]:
#import the necessary variables
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
 

In [2]:
#set display options
pd.options.display.max_rows=10

In [3]:
#load the preprocessed data
data_preprocessed = pd.read_csv('04.Absenteeism_preprocessed.csv')

In [4]:
data_preprocessed

Unnamed: 0.1,Unnamed: 0,reason_1,reason_2,reason_3,reason_4,months,Days,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,1,0,0,5,3,225,15,41,237.656,28,1,2,2,8
1,1,0,0,0,1,3,3,225,26,28,222.196,24,0,1,2,2
2,2,0,0,0,1,11,3,179,51,38,284.031,31,0,0,0,3
3,3,0,0,0,1,3,1,228,14,58,222.196,22,0,2,1,3
4,4,0,0,0,1,3,2,361,52,28,244.387,27,0,1,4,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,695,1,0,0,0,2,0,388,15,50,302.585,24,0,0,0,8
696,696,0,0,0,1,3,4,179,51,38,222.196,31,0,0,0,3
697,697,0,0,0,1,10,2,179,51,38,253.465,31,0,0,0,8
698,698,0,0,0,1,6,3,225,26,28,377.550,24,0,1,2,2


In [5]:
data_preprocessed['Absenteeism Time in Hours'].describe()

count    700.000000
mean       6.761429
std       12.670082
min        0.000000
25%        2.000000
50%        3.000000
75%        8.000000
max      120.000000
Name: Absenteeism Time in Hours, dtype: float64

In [6]:
#Create checkpoint
data_output_categorized = data_preprocessed.copy()

In [7]:
#categorize the outputs
data_output_categorized['targets']= data_output_categorized['Absenteeism Time in Hours'].map(lambda x: 0 if x<=np.median(data_output_categorized['Absenteeism Time in Hours']) else 1)

In [8]:
#drop the absenteesim column
data = data_output_categorized.drop(['Unnamed: 0', 'Absenteeism Time in Hours'], axis=1)

In [9]:
data

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,months,Days,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,targets
0,0,1,0,0,5,3,225,15,41,237.656,28,1,2,2,1
1,0,0,0,1,3,3,225,26,28,222.196,24,0,1,2,0
2,0,0,0,1,11,3,179,51,38,284.031,31,0,0,0,0
3,0,0,0,1,3,1,228,14,58,222.196,22,0,2,1,0
4,0,0,0,1,3,2,361,52,28,244.387,27,0,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2,0,388,15,50,302.585,24,0,0,0,1
696,0,0,0,1,3,4,179,51,38,222.196,31,0,0,0,0
697,0,0,0,1,10,2,179,51,38,253.465,31,0,0,0,1
698,0,0,0,1,6,3,225,26,28,377.550,24,0,1,2,0


### Split the inputs and the targets

In [10]:
#set aside the inputs
inputs = data.iloc[:,:-1]

#set aside the outputs
targets = data.iloc[:,-1]

### Scale the inputs

In [11]:
#initiate a standardscaler instance
scaler = StandardScaler()

#train the model
scaler.fit(inputs)

#perform the scaling and save in a variable
inputs_scaled = scaler.transform(inputs)

### split the  the train and test data

In [12]:
#split the data into train and test
train_x, test_x, train_y, test_y = train_test_split(inputs_scaled, targets, test_size=0.2, random_state = 1)

In [13]:
#check the variable dimensions
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((560, 14), (140, 14), (560,), (140,))

### Perform Logistic Regression

In [14]:
#initiate a Logistic Regression instance
reg_log = LogisticRegression()

#train the model
reg_log.fit(train_x, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
#print the probabilitiy of excessive absenteeism: first column is 0, second column is 1(excessive absent)
reg_log.predict_proba(test_x)

array([[0.72918273, 0.27081727],
       [0.78550531, 0.21449469],
       [0.62434147, 0.37565853],
       [0.4815518 , 0.5184482 ],
       [0.64088431, 0.35911569],
       [0.86921769, 0.13078231],
       [0.86039548, 0.13960452],
       [0.59635915, 0.40364085],
       [0.40812685, 0.59187315],
       [0.56726371, 0.43273629],
       [0.80818703, 0.19181297],
       [0.53807278, 0.46192722],
       [0.70652012, 0.29347988],
       [0.64906591, 0.35093409],
       [0.78660372, 0.21339628],
       [0.37813153, 0.62186847],
       [0.66427913, 0.33572087],
       [0.71251694, 0.28748306],
       [0.04817205, 0.95182795],
       [0.83856117, 0.16143883],
       [0.24741697, 0.75258303],
       [0.22576629, 0.77423371],
       [0.7925983 , 0.2074017 ],
       [0.63421657, 0.36578343],
       [0.84396928, 0.15603072],
       [0.77293776, 0.22706224],
       [0.80126715, 0.19873285],
       [0.08884047, 0.91115953],
       [0.60340808, 0.39659192],
       [0.5618319 , 0.4381681 ],
       [0.

In [16]:
#evaluate the result
reg_log.score(train_x, train_y)

0.7392857142857143

## Manually check the result

In [17]:
#create a summary table
summary = pd.DataFrame()
summary['test_y'] = test_y
summary['predicted_y'] = reg_log.predict(test_x)
summary['result'] = (summary['test_y'] == summary['predicted_y'])

In [18]:
summary.head(2)

Unnamed: 0,test_y,predicted_y,result
681,1,0,False
626,1,0,False


In [19]:
#check the stats
summary['result'].describe(include='all')

count      140
unique       2
top       True
freq        90
Name: result, dtype: object

In [20]:
#calculate the accuracy of the predictions
accuracy = summary['result'].describe().freq / summary['result'].count() * 100
print('The accuracy of this model for the test data is {: .1f}% . '.format(accuracy))

The accuracy of this model for the test data is  64.3% . 


## Check the accuracy of the training

In [21]:
summary_train = pd.DataFrame()
summary_train['train_y'] = train_y
summary_train['predict_y'] = reg_log.predict(train_x)
summary_train['result'] = (summary_train['train_y']==summary_train['predict_y'])

In [22]:
summary_train['result'].describe()

count      560
unique       2
top       True
freq       414
Name: result, dtype: object

In [23]:
#calculate the accuracy of the predictions
accuracy = summary_train['result'].describe().freq / summary_train['result'].count() * 100
print('The accuracy of this model for the training data is {: .1f}% . '.format(accuracy))

The accuracy of this model for the training data is  73.9% . 


## Create a summary table with coefficients

In [24]:
summary_table = pd.DataFrame()

In [25]:
summary_table['features'] = inputs.columns
summary_table['Weights'] = reg_log.coef_.reshape(-1,1)

In [26]:
summary_table.index = summary_table.index+1

In [27]:
summary_table.loc[0] = ['intercept', float(reg_log.intercept_)]

In [28]:
summary_table['odds ratio'] = np.exp(summary_table.Weights)

In [29]:
summary_table = summary_table.sort_index()

In [30]:
summary_table

Unnamed: 0,features,Weights,odds ratio
0,intercept,-0.234264,0.791153
1,reason_1,0.148719,1.160346
2,reason_2,-0.033580,0.966977
3,reason_3,0.557793,1.746814
4,reason_4,-0.451503,0.636671
...,...,...,...
10,Daily Work Load Average,0.056678,1.058315
11,Body Mass Index,0.235694,1.265787
12,Education,-0.059259,0.942462
13,Children,0.337266,1.401112
