In [1]:
#import the necessary variables
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
 

In [2]:
#set display options
pd.options.display.max_rows=15

In [3]:
#load the preprocessed data
data_preprocessed = pd.read_csv('04.Absenteeism_preprocessed.csv')

In [4]:
data_preprocessed

Unnamed: 0.1,Unnamed: 0,reason_1,reason_2,reason_3,reason_4,months,Days,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,1,0,0,5,3,225,15,41,237.656,28,1,2,2,8
1,1,0,0,0,1,3,3,225,26,28,222.196,24,0,1,2,2
2,2,0,0,0,1,11,3,179,51,38,284.031,31,0,0,0,3
3,3,0,0,0,1,3,1,228,14,58,222.196,22,0,2,1,3
4,4,0,0,0,1,3,2,361,52,28,244.387,27,0,1,4,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,695,1,0,0,0,2,0,388,15,50,302.585,24,0,0,0,8
696,696,0,0,0,1,3,4,179,51,38,222.196,31,0,0,0,3
697,697,0,0,0,1,10,2,179,51,38,253.465,31,0,0,0,8
698,698,0,0,0,1,6,3,225,26,28,377.550,24,0,1,2,2


In [5]:
data_preprocessed['Absenteeism Time in Hours'].describe()

count    700.000000
mean       6.761429
std       12.670082
min        0.000000
25%        2.000000
50%        3.000000
75%        8.000000
max      120.000000
Name: Absenteeism Time in Hours, dtype: float64

In [6]:
#Create checkpoint
data_output_categorized = data_preprocessed.copy()

In [7]:
#categorize the outputs
data_output_categorized['targets']= data_output_categorized['Absenteeism Time in Hours'].map(lambda x: 0 if x<=np.median(data_output_categorized['Absenteeism Time in Hours']) else 1)

In [8]:
#drop the absenteesim column
data = data_output_categorized.drop(['Unnamed: 0', 'Absenteeism Time in Hours'], axis=1)

In [9]:
data

Unnamed: 0,reason_1,reason_2,reason_3,reason_4,months,Days,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,targets
0,0,1,0,0,5,3,225,15,41,237.656,28,1,2,2,1
1,0,0,0,1,3,3,225,26,28,222.196,24,0,1,2,0
2,0,0,0,1,11,3,179,51,38,284.031,31,0,0,0,0
3,0,0,0,1,3,1,228,14,58,222.196,22,0,2,1,0
4,0,0,0,1,3,2,361,52,28,244.387,27,0,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2,0,388,15,50,302.585,24,0,0,0,1
696,0,0,0,1,3,4,179,51,38,222.196,31,0,0,0,0
697,0,0,0,1,10,2,179,51,38,253.465,31,0,0,0,1
698,0,0,0,1,6,3,225,26,28,377.550,24,0,1,2,0


### Split the inputs and the targets

In [10]:
#set aside the inputs
inputs = data.iloc[:,:-1]

#set aside the outputs
targets = data.iloc[:,-1]

# BACKWARD ELIMINATION

In [11]:
#BACKWARD ELIMINATION
#After reviewing the summary table at the end 
#notice that ['Distance to Work', 'Daily Work Load Average', 'reason_2'] columns 
#have minimal effect for our model. To simplify the model they can be removed.
inputs.columns.values


array(['reason_1', 'reason_2', 'reason_3', 'reason_4', 'months', 'Days',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [12]:
#reorganize the input columns
inputs = inputs[['reason_1', 'reason_3', 'reason_4', 'months', 'Days',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets']]

### Scale the inputs

In [13]:
#initiate a standardscaler instance
scaler = StandardScaler()

#train the model
scaler.fit(inputs)

#perform the scaling and save in a variable
inputs_scaled = scaler.transform(inputs)

### split the  the train and test data

In [14]:
#split the data into train and test
train_x, test_x, train_y, test_y = train_test_split(inputs_scaled, targets, test_size=0.2, random_state = 1)

In [15]:
#check the variable dimensions
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((560, 11), (140, 11), (560,), (140,))

### Perform Logistic Regression

In [16]:
#initiate a Logistic Regression instance
reg_log = LogisticRegression()

#train the model
reg_log.fit(train_x, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
#print the probabilitiy of excessive absenteeism: first column is 0, second column is 1(excessive absent)
reg_log.predict_proba(test_x)

array([[0.71286295, 0.28713705],
       [0.77115353, 0.22884647],
       [0.61554067, 0.38445933],
       [0.48888019, 0.51111981],
       [0.6602628 , 0.3397372 ],
       [0.86334704, 0.13665296],
       [0.85282054, 0.14717946],
       [0.59055923, 0.40944077],
       [0.38989107, 0.61010893],
       [0.56914857, 0.43085143],
       [0.80057007, 0.19942993],
       [0.55137117, 0.44862883],
       [0.72893214, 0.27106786],
       [0.68526495, 0.31473505],
       [0.7906853 , 0.2093147 ],
       [0.40266736, 0.59733264],
       [0.67362175, 0.32637825],
       [0.70689716, 0.29310284],
       [0.04825622, 0.95174378],
       [0.83182707, 0.16817293],
       [0.24173563, 0.75826437],
       [0.25829914, 0.74170086],
       [0.78496054, 0.21503946],
       [0.6372266 , 0.3627734 ],
       [0.8379818 , 0.1620182 ],
       [0.76982076, 0.23017924],
       [0.80280903, 0.19719097],
       [0.09323429, 0.90676571],
       [0.61199239, 0.38800761],
       [0.53811107, 0.46188893],
       [0.

In [18]:
#evaluate the result
reg_log.score(train_x, train_y)

0.7339285714285714

## Manually check the result

In [19]:
#create a summary table
summary = pd.DataFrame()
summary['test_y'] = test_y
summary['predicted_y'] = reg_log.predict(test_x)
summary['result'] = (summary['test_y'] == summary['predicted_y'])

In [20]:
summary.head(2)

Unnamed: 0,test_y,predicted_y,result
681,1,0,False
626,1,0,False


In [21]:
#check the stats
summary['result'].describe(include='all')

count      140
unique       2
top       True
freq        92
Name: result, dtype: object

In [22]:
#calculate the accuracy of the predictions
accuracy = summary['result'].describe().freq / summary['result'].count() * 100
print('The accuracy of this model for the test data is {: .1f}% . '.format(accuracy))

The accuracy of this model for the test data is  65.7% . 


## Check the accuracy of the training

In [23]:
summary_train = pd.DataFrame()
summary_train['train_y'] = train_y
summary_train['predict_y'] = reg_log.predict(train_x)
summary_train['result'] = (summary_train['train_y']==summary_train['predict_y'])

In [24]:
summary_train['result'].describe()

count      560
unique       2
top       True
freq       411
Name: result, dtype: object

In [25]:
#calculate the accuracy of the predictions
accuracy = summary_train['result'].describe().freq / summary_train['result'].count() * 100
print('The accuracy of this model for the training data is {: .1f}% . '.format(accuracy))

The accuracy of this model for the training data is  73.4% . 


## Create a summary table with coefficients

In [26]:
summary_table = pd.DataFrame()

In [27]:
summary_table['features'] = inputs.columns
summary_table['Weights'] = reg_log.coef_.reshape(-1,1)

In [28]:
summary_table.index = summary_table.index+1

In [29]:
summary_table.loc[0] = ['intercept', float(reg_log.intercept_)]

In [30]:
summary_table['odds ratio'] = np.exp(summary_table.Weights)

In [31]:
summary_table = summary_table.sort_index()

In [32]:
summary_table

Unnamed: 0,features,Weights,odds ratio
0,intercept,-0.235479,0.790192
1,reason_1,0.223539,1.250494
2,reason_3,0.60069,1.823377
3,reason_4,-0.383279,0.681623
4,months,0.091403,1.09571
5,Days,-0.245276,0.782488
6,Transportation Expense,0.580594,1.787099
7,Age,-0.26476,0.76739
8,Body Mass Index,0.224301,1.251447
9,Education,-0.057364,0.94425


### ['Distance to Work', 'Daily Work Load Average', 'reason_2'] columns have minimal effect for our model. To simplify the model they can be removed.
Prior to Backward Elimination:
The accuracy of this model for the training data is  73.9% . 
The accuracy of this model for the test data is  64.3% . 

## After the Backward Elimination:

The accuracy of this model for the training data is  73.4% . 
The accuracy of this model for the test data is  65.7% . 

### So, removing the variables had negligible impact. 