In [477]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression ,Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.svm import SVR

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, PowerTransformer, PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from dython.nominal import associations, cramers_v, correlation_ratio, theils_u
from xgboost import XGBRegressor

pd.options.display.max_columns = 999
import warnings
warnings.filterwarnings('ignore')

In [478]:
df = pd.read_csv('Absenteeism.csv', sep = ';')

In [479]:
df.drop(columns = 'ID', inplace = True)

In [480]:
df

Unnamed: 0,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,26,7,3,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,4
1,0,7,3,1,118,13,18,50,239.554,97,1,1,1,1,0,0,98,178,31,0
2,23,7,4,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
3,7,7,5,1,279,5,14,39,239.554,97,0,1,2,1,1,0,68,168,24,4
4,23,7,5,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,14,7,3,1,289,36,13,33,264.604,93,0,1,2,1,0,1,90,172,30,8
736,11,7,3,1,235,11,14,37,264.604,93,0,3,1,0,0,1,88,172,29,4
737,0,0,3,1,118,14,13,40,271.219,95,0,1,1,1,0,8,98,170,34,0
738,0,0,4,2,231,35,14,39,271.219,95,0,1,2,1,0,2,100,170,35,0


In [481]:
num_col = []
for i in df.columns:
    num_col.append(i)

In [482]:
num_col.remove('Absenteeism time in hours')

In [483]:
num_col.remove('Reason for absence')

In [484]:
X = df.drop(columns= 'Absenteeism time in hours')
y = df['Absenteeism time in hours']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 42)

In [485]:
# Base model
preprocessor = ColumnTransformer([
    ('ohs', OneHotEncoder(handle_unknown ='ignore' ), ['Reason for absence'])
    
],remainder = 'passthrough')

In [486]:
pipeline_LR = Pipeline([
    ('prep', preprocessor),
    ('algo', LinearRegression())
])
pipeline_KNN = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsRegressor())
])
pipeline_RF =  Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestRegressor())
])

In [487]:
pipeline_LR.fit(X_train,y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence'])])),
                ('algo', LinearRegression())])

In [488]:
def Eva_matrix(Model, X_train, y_train, X_test, y_test, Name):
    y_pred_tr = Model.predict(X_train)
    r2_tr = r2_score(y_train, y_pred_tr)
    mae_tr = mean_absolute_error(y_train, y_pred_tr)
    mse_tr = mean_squared_error(y_train, y_pred_tr)
    rmse_tr = np.sqrt(mse_tr)
    y_ts = Model.predict(X_test)
    r2_ts = r2_score(y_test, y_ts)
    mae_ts = mean_absolute_error(y_test, y_ts)
    mse_ts = mean_squared_error(y_test, y_ts)
    rmse_ts = np.sqrt(mse_ts)
    data = {
        f"Train {Name}" : [r2_tr, mae_tr, mse_tr, rmse_tr],
        f"Test {Name}" : [r2_ts, mae_ts, mse_ts, rmse_ts]
    }
    df = pd.DataFrame(data, index=['R2', 'MAE', 'MSE', 'RMSE'])
    return df

In [489]:
df_LR_base = Eva_matrix(pipeline_LR, X_train, y_train, X_test, y_test, "Base LR")
df_LR_base

Unnamed: 0,Train Base LR,Test Base LR
R2,0.28124,-3.100002e+20
MAE,5.69977,21366130000.0
MSE,139.740688,3.378186e+22
RMSE,11.821197,183798400000.0


In [490]:
pipeline_KNN.fit(X_train,y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence'])])),
                ('algo', KNeighborsRegressor())])

In [491]:
df_KNN_base = Eva_matrix(pipeline_KNN, X_train, y_train, X_test, y_test, "Base KNN")
df_KNN_base

Unnamed: 0,Train Base KNN,Test Base KNN
R2,0.265591,-0.288491
MAE,5.265878,5.95
MSE,142.783311,140.411622
RMSE,11.949197,11.849541


In [492]:
pipeline_RF.fit(X_train,y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence'])])),
                ('algo', RandomForestRegressor())])

In [493]:
df_RF_base = Eva_matrix(pipeline_RF, X_train, y_train ,X_test, y_test, "Base RF")
df_RF_base

Unnamed: 0,Train Base RF,Test Base RF
R2,0.84874,-0.105257
MAE,2.169381,4.634961
MSE,29.407875,120.443999
RMSE,5.422903,10.974698


# feature engineering

In [494]:
# dropping weight and height -> irrelevant. body mass index is more accurate

In [495]:
df_ml = df.copy()

In [496]:
df_ml.drop(columns = ['Height', 'Weight'], inplace = True)

In [497]:
X_1 = df_ml.drop(columns= 'Absenteeism time in hours')
y_1= df_ml['Absenteeism time in hours']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_1, y_1, test_size= 0.2, random_state = 42)

In [498]:
num_col.remove('Weight')
num_col.remove('Height')

In [499]:
num_col

['Month of absence',
 'Day of the week',
 'Seasons',
 'Transportation expense',
 'Distance from Residence to Work',
 'Service time',
 'Age',
 'Work load Average/day ',
 'Hit target',
 'Disciplinary failure',
 'Education',
 'Son',
 'Social drinker',
 'Social smoker',
 'Pet',
 'Body mass index']

In [500]:
num_pipeline = Pipeline([
    ('scale', RobustScaler()),
    ('poly', PolynomialFeatures(degree =2)),
    ('power', PowerTransformer(method = 'yeo-johnson'))
    
    
])

In [501]:
# Feature engineering model
preprocessor = ColumnTransformer([
    ('ohs', OneHotEncoder(handle_unknown ='ignore' ), ['Reason for absence']),
    ('num', num_pipeline, num_col)
    
],remainder = 'passthrough')

In [502]:
pipeline_LR = Pipeline([
    ('prep', preprocessor),
    ('algo', LinearRegression())
])
pipeline_KNN = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsRegressor())
])
pipeline_RF =  Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestRegressor())
])
pipeline_XGB= Pipeline([
    ('prep', preprocessor),
    ('algo', XGBRegressor())
])
pipeline_SVM = Pipeline([
    ('prep', preprocessor),
    ('algo', SVR())
])

In [503]:
pipeline_LR.fit(X_train2, y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [504]:
df_LR_base = Eva_matrix(pipeline_LR, X_train2, y_train2, X_test2, y_test2, "Base LR")
df_LR_base

Unnamed: 0,Train Base LR,Test Base LR
R2,0.441364,-0.980314
MAE,5.77788,8.848079
MSE,108.609617,215.802179
RMSE,10.421594,14.690207


In [505]:
pipeline_KNN.fit(X_train2,y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [506]:
df_KNN_base = Eva_matrix(pipeline_KNN, X_train2, y_train2, X_test2, y_test2, "Base KNN")
df_KNN_base

Unnamed: 0,Train Base KNN,Test Base KNN
R2,0.34915,-0.136979
MAE,5.049662,5.639189
MSE,126.53777,123.900811
RMSE,11.248901,11.131074


In [507]:
pipeline_RF.fit(X_train2,y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [508]:
df_RF_base = Eva_matrix(pipeline_RF, X_train2, y_train2,X_test2, y_test2, "Base RF")
df_RF_base

Unnamed: 0,Train Base RF,Test Base RF
R2,0.85323,-0.046231
MAE,2.270547,5.216936
MSE,28.534818,114.011633
RMSE,5.341799,10.677623


In [509]:
pipeline_SVM.fit(X_train2, y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [510]:
df_SVM_base = Eva_matrix(pipeline_SVM, X_train2, y_train2,X_test2, y_test2, "Base SVM")
df_SVM_base

Unnamed: 0,Train Base SVM,Test Base SVM
R2,-0.001491,0.004644
MAE,4.50712,4.126768
MSE,194.709105,108.467618
RMSE,13.95382,10.414779


In [511]:
pipeline_XGB.fit(X_train2, y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [512]:
df_XGB_base = Eva_matrix(pipeline_XGB, X_train2, y_train2,X_test2, y_test2, "Base XGB")
df_XGB_base

Unnamed: 0,Train Base XGB,Test Base XGB
R2,0.997347,-1.545956
MAE,0.248519,6.433788
MSE,0.51584,277.442239
RMSE,0.71822,16.656597


In [513]:
# Ridge
pipeline_ridge = Pipeline([
    ('prep', preprocessor),
    ('algo', Ridge())
])

In [514]:
pipeline_ridge.fit(X_train2, y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [515]:
df_ridge_base = Eva_matrix(pipeline_ridge, X_train2, y_train2,X_test2, y_test2, "Base ridge")
df_ridge_base

Unnamed: 0,Train Base ridge,Test Base ridge
R2,0.443227,-0.770648
MAE,5.592171,8.321479
MSE,108.247445,192.954026
RMSE,10.404203,13.890789


In [516]:
# Lasso

In [517]:
pipeline_lasso = Pipeline([
    ('prep', preprocessor),
    ('algo', Lasso())
])


In [518]:
pipeline_lasso.fit(X_train2, y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [519]:
df_lasso_base = Eva_matrix(pipeline_lasso, X_train2, y_train2,X_test2, y_test2, "Base lasso")
df_lasso_base

Unnamed: 0,Train Base lasso,Test Base lasso
R2,0.064475,-0.005068
MAE,5.969322,5.521167
MSE,181.884087,109.525974
RMSE,13.486441,10.465466


# Hyperparameter

In [520]:
param_KNN = {
    'algo__n_neighbors': np.arange(2,51,2),
    "algo__p" : [1,2],
    'algo__weights' : ['uniform', 'distance']
}

In [521]:
KNN_GS = GridSearchCV(pipeline_KNN, param_KNN, cv = 3, scoring = 'r2', n_jobs= -1, verbose = 1)

In [522]:
KNN_GS.fit(X_train2, y_train2)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   16.4s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('ohs',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['Reason '
                                                                          'for '
                                                                          'absence']),
                                                                        ('num',
                                                                         Pipeline(steps=[('scale',
                                                                                          RobustScaler()),
                                                                                         ('poly',
                     

In [523]:
KNN_GS.best_params_

{'algo__n_neighbors': 44, 'algo__p': 2, 'algo__weights': 'distance'}

In [524]:
KNN_T = KNN_GS.best_estimator_

In [525]:
df_KNN_T = Eva_matrix(KNN_T, X_train2, y_train2, X_test2, y_test2, "KNN T")
df_KNN_T

Unnamed: 0,Train KNN T,Test KNN T
R2,0.997651,-0.18998
MAE,0.087838,5.335458
MSE,0.456644,129.676504
RMSE,0.675754,11.387559


# Fine tuning SVM

In [527]:
param_SVM = {
    'algo__C' : np.logspace(-3,3,7),
    'algo__gamma' : np.arange(10, 101, 10)
}

In [528]:
SVM_GS = GridSearchCV(pipeline_SVM, param_SVM, cv = 3, scoring = 'r2', n_jobs= -1, verbose = 1)

In [529]:
SVM_GS.fit(X_train2, y_train2)

Fitting 3 folds for each of 70 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   12.3s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('ohs',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['Reason '
                                                                          'for '
                                                                          'absence']),
                                                                        ('num',
                                                                         Pipeline(steps=[('scale',
                                                                                          RobustScaler()),
                                                                                         ('poly',
                     

In [530]:
SVM_T = SVM_GS.best_estimator_

In [531]:
df_SVM_T = Eva_matrix(SVM_T, X_train2, y_train2, X_test2, y_test2, "SVM T")
df_SVM_T

Unnamed: 0,Train SVM T,Test SVM T
R2,0.991932,-0.210786
MAE,0.258228,5.439342
MSE,1.568621,131.943865
RMSE,1.252446,11.486682


# RF fine tuning

In [532]:
pipeline_RF_fine =  Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestRegressor(n_estimators = 500, 
    max_depth = 6, 
    max_features = 0.8))
]) 

In [533]:
pipeline_RF_fine.fit(X_train2,y_train2)

Pipeline(steps=[('prep',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohs',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Reason for absence']),
                                                 ('num',
                                                  Pipeline(steps=[('scale',
                                                                   RobustScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures()),
                                                                  ('power',
                                                                   PowerTransformer())]),
                                                  ['Month of absence',
                                                   'Day of the week', 'Sea

In [534]:
df_RF_T = Eva_matrix(pipeline_RF_fine, X_train2, y_train2, X_test2, y_test2, "RF T")
df_RF_T

Unnamed: 0,Train RF T,Test RF T
R2,0.747162,0.003245
MAE,4.029241,5.372985
MSE,49.156641,108.620089
RMSE,7.01118,10.422096


# conclusion

We use accuracy to find the best model possible in prediction. The RMSE for random forest fine tuned also has the lowest score = 10.4

The best algoritm is Random forest fine tuning with R2 score 0.74, RMSE = 10.4. The model is overfit so i recommend the company to gather more health data about the employee such as (glucose level, cholestrol, uric acid, blood pressure and so on) and the data needs to be updated every terms. So that it gave more accurate readings of the prediction. Healthy employee leads to increase in productivity.
Example:
- cost of medical wellness program : 5 usd per employee
    : 10 usd x 740 employee = 7400 usd per term
 assumption:
- productivity increase = leads to more profit for the business
- increase 20% of productivity -> increase 20% of profit
- if the mean profit every terms (quartile ) =  1000000 USD  -> 20% increase = 1200000.
- so profit for the company increase -> 1200000 - 7400 = in percentage = 16.3 %