# Applying ML to Create the "absenteeism_module"

We use ``logistic regression`` (a type of classification model), which takes the reason for absence, the month of the year, the day of the week, and the remaining features as inputs to predict whether an employee's absenteeism is excessive.

### Load the data, create targets and select inputs

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

# --- Load the preprocessed data ---------------------------------------------
data_preprocessed = pd.read_csv('0absenteeism_preprocessed.csv')

# --- Create the targets -----------------------------------------------------
# Two classes derived from the hours column: a row is "excessively absent" (1)
# when its hours are strictly above the median, "moderately absent" (0) otherwise.
HOURS_COLUMN = 'Absenteeism Time in Hours'
TARGET_COLUMN = 'Excessive Absenteeism'

median = data_preprocessed[HOURS_COLUMN].median()
targets = np.where(data_preprocessed[HOURS_COLUMN] > median, 1, 0)
data_preprocessed[TARGET_COLUMN] = targets

# Share of rows labelled 1; roughly 0.45-0.55 indicates balanced classes.
excessive_share = targets.mean()

# --- Drop the columns that must not / need not be model inputs ---------------
# The hours column was used to build the targets, so keeping it would leak the
# answer. The other three columns are the features removed by backward
# elimination (see the "Interpreting the Coefficients" section).
DROPPED_COLUMNS = [HOURS_COLUMN, "Day of the Week", "Daily Work Load Average", 'Distance to Work']
data_with_targets = data_preprocessed.drop(columns=DROPPED_COLUMNS)

# --- Select the inputs -------------------------------------------------------
# Remove the target by name rather than by position, so the selection stays
# correct even if the target is not the last column of the frame.
unscaled_inputs = data_with_targets.drop(columns=[TARGET_COLUMN])
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,False,False,False,True,7,289,33,30,0,2,1
1,False,False,False,False,7,118,50,31,0,1,0
2,False,False,False,True,7,179,38,31,0,0,0
3,True,False,False,False,7,279,39,24,0,2,0
4,False,False,False,True,7,289,33,30,0,2,1


### Standardize the Data
For each variable (column), subtract its mean and divide by its standard deviation.

In [2]:
#absenteeism_scaler = StandardScaler()
#! we will use a custom scaler instead
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The listed ``columns`` are passed through a ``StandardScaler``; every other
    column (e.g. dummy variables) is returned untouched. Row index and column
    order of the input are preserved in the output.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded unchanged to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``X[columns]`` by ``fit``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Every constructor argument is kept as a same-named attribute so that
        # BaseEstimator.get_params() (used by repr() and clone()) can read it back.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis=0 keeps these per-column; np.mean/np.var on a DataFrame
        # go through axis=None, which warns or collapses to a scalar on recent pandas.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns on index labels, so a fresh
        # RangeIndex here would misalign rows (and introduce NaNs) whenever X
        # does not carry the default 0..n-1 index, e.g. a shuffled subset.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

# Columns treated as dummy variables are left unscaled; every other input
# column is standardized.
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

columns_to_scale = []
for column_name in unscaled_inputs.columns.values:
    if column_name in columns_to_omit:
        continue
    columns_to_scale.append(column_name)

# fit() returns the scaler itself, so construction and fitting can be chained.
absenteeism_scaler = CustomScaler(columns_to_scale).fit(unscaled_inputs)

scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs.shape

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


(700, 11)

### Split the Data for Training and Testing

In [3]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# pseudorandom but repeatable, so every run produces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

# Shapes of (inputs, targets) for the training split, then the test split.
for split_inputs, split_targets in ((x_train, y_train), (x_test, y_test)):
    print(split_inputs.shape, split_targets.shape)

(560, 11) (560,)
(140, 11) (140,)


### Finally, the Modeling Part

In [4]:
# --- Fit the model on the training split -------------------------------------
reg = LogisticRegression()
reg.fit(x_train, y_train)

# Training accuracy as computed by the estimator: the share of training
# predictions that match the targets.
train_accuracy = reg.score(x_train, y_train)

# --- Manually check the accuracy ----------------------------------------------
model_outputs = reg.predict(x_train)
matches = model_outputs == y_train  # boolean array: True where prediction == target
manual_accuracy = matches.sum() / model_outputs.shape[0]

print(metrics.confusion_matrix(y_train, model_outputs))
print(metrics.classification_report(y_train, model_outputs))

# --- Summarize intercept (bias) and coefficients (weights) --------------------
# Row 0 holds the intercept; rows 1..n hold the features in input-column order.
feature_name = unscaled_inputs.columns.values
intercept_row = pd.DataFrame(
    {'Feature Name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}
)
feature_rows = pd.DataFrame(
    {'Feature Name': feature_name, 'Coefficient': reg.coef_[0]}
)
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

[[251  56]
 [ 71 182]]
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       307
           1       0.76      0.72      0.74       253

    accuracy                           0.77       560
   macro avg       0.77      0.77      0.77       560
weighted avg       0.77      0.77      0.77       560



Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.64699
1,Reason_1,2.800006
2,Reason_2,0.951748
3,Reason_3,3.114061
4,Reason_4,0.838359
5,Month Value,0.158977
6,Transportation Expense,0.605137
7,Age,-0.169906
8,Body Mass Index,0.279982
9,Education,-0.210174


### Interpreting the Coefficients

In [6]:
# The odds ratio of a feature is the exponential of its coefficient.
odds_ratio = np.exp(summary_table['Coefficient'])
summary_table = summary_table.assign(**{'Odds Ratio': odds_ratio})

# How to read the table displayed below (largest odds ratio first):
# - A feature carries little weight when its coefficient is close to 0,
#   i.e. when its odds ratio is close to 1.
# - Reason 3 (poisoning related) has the highest odds ratio, followed by
#   reason 1 (various diseases), reason 2 (pregnancy related) and
#   reason 4 (light diseases).
# - Daily work load average, distance to work and day of the week showed odds
#   ratios near 1, so they were removed (backward elimination). The removal is
#   done in the first code cell of this notebook, together with the
#   'Absenteeism Time in Hours' column.
summary_table.sort_values('Odds Ratio', ascending=False)


Unnamed: 0,Feature Name,Coefficient,Odds Ratio
3,Reason_3,3.114061,22.51227
1,Reason_1,2.800006,16.444753
2,Reason_2,0.951748,2.590233
4,Reason_4,0.838359,2.31257
6,Transportation Expense,0.605137,1.831503
10,Children,0.348424,1.416833
8,Body Mass Index,0.279982,1.323106
5,Month Value,0.158977,1.172311
7,Age,-0.169906,0.843744
9,Education,-0.210174,0.810443


### Testing the Model

In [8]:
 #* end of the ml process, since once tested (once it sees the test data), we are not allowed to touch the model anymore

# Test accuracy: the share of test-set predictions that match the targets.
test_accuracy = reg.score(x_test, y_test)

# Class probabilities for every test row: a 2-D array with one column per
# class (column 0 -> class 0, column 1 -> class 1); each row sums to 1.
predicted_proba = reg.predict_proba(x_test)

# Probability of class 1 (excessive absenteeism) for each test row.
predicted_proba[:, 1]

array([[0.71342516, 0.28657484],
       [0.5873216 , 0.4126784 ],
       [0.44016153, 0.55983847],
       [0.78163061, 0.21836939],
       [0.08407928, 0.91592072],
       [0.3348226 , 0.6651774 ],
       [0.29971206, 0.70028794],
       [0.13112385, 0.86887615],
       [0.78627908, 0.21372092],
       [0.74906578, 0.25093422],
       [0.49395555, 0.50604445],
       [0.22492002, 0.77507998],
       [0.07135527, 0.92864473],
       [0.73173354, 0.26826646],
       [0.30957854, 0.69042146],
       [0.54726422, 0.45273578],
       [0.55051921, 0.44948079],
       [0.53926379, 0.46073621],
       [0.40197149, 0.59802851],
       [0.05365482, 0.94634518],
       [0.70030387, 0.29969613],
       [0.78163061, 0.21836939],
       [0.42028246, 0.57971754],
       [0.42028246, 0.57971754],
       [0.24801464, 0.75198536],
       [0.74567806, 0.25432194],
       [0.51026557, 0.48973443],
       [0.8569309 , 0.1430691 ],
       [0.20365204, 0.79634796],
       [0.78163061, 0.21836939],
       [0.

### Save the Model and Create a Module

In [None]:
# Persist the fitted model and the fitted scaler so they can be reused outside
# this notebook. `pickle` is already imported in the first code cell.
# NOTE: unpickling executes code, so only load these files from a trusted
# source; loading scaler.pkl also requires the CustomScaler class to be
# importable/defined in the loading environment.
with open('model.pkl', 'wb') as model_file:  # 'wb' = write bytes ('rb' when reading)
    pickle.dump(reg, model_file)  # dump to save (pickle.load to read back)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)


#* create a module (software component containing the code that will help us execute the model)
# for reusability, done in 3absenteeism_module.py