# Task: Predict absenteeism from work

What we want to do: Explore whether or not a specific known reason for absence induces an individual to be excessively absent from work. That’s why we don’t really need to keep in our data set information about someone who has been away due to an unknown reason.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw absenteeism records from a CSV file in the working directory
raw_csv_data = pd.read_csv('Absenteeism_data.csv')

## Data Preprocessing

In [3]:
# Work on a deep copy so that raw_csv_data itself is never modified

df = raw_csv_data.copy(deep = True)
df.head(5)

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [4]:
# Summarise column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [5]:
# ID is an employee identifier (a label, not a measurement), so it is removed

df = df.drop(columns = ['ID'])

### "Reason for Absence"

* Explore/Analyze attribute and its values
* Convert Categorical Nominal variable into dummy variables
* Validate newly created reason columns using 'check'
* Drop 'check' column after validation
* Drop Reason 0 as it is "Unknown"
* Drop 'Reason for absence' from DataFrame to avoid multicollinearity
* Classification of Reasons into 4 Types to reduce number of attributes
* Concatenate reason types and original dataframe

In [6]:
# Exploration of Reason for Absence: smallest and largest code, the distinct codes
# in order of first appearance, how many there are, and the codes sorted ascending

reason_codes = df['Reason for Absence']
distinct_codes = reason_codes.unique()

print(reason_codes.min())
print(reason_codes.max())
print(distinct_codes)
print(len(distinct_codes))
print(sorted(distinct_codes))


0
28
[26  0 23  7 22 19  1 11 14 21 10 13 28 18 25 24  6 27 17  8 12  5  9 15
  4  3  2 16]
28
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28]


In [7]:
# One indicator column per distinct reason code; the column labels are the integer codes
reason_columns = pd.get_dummies(data = df['Reason for Absence'])
reason_columns.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
# Each row should have exactly one indicator set, so every row total is 1 and the
# totals add up to 700 (1 times 700); a row total of 0 would imply a missing value

row_totals = reason_columns.sum(axis = 'columns')
reason_columns['check'] = row_totals
print(row_totals.sum())
print(row_totals.unique())

700
[1]


In [9]:
# Drop 'check' now that validation is complete.
# Drop Reason 0 as it is unknown: with its column gone, a row with no reason
# indicator set stands for "reason 0", which makes it the baseline category.
# The previous version stored the drop_first result under a misspelled name
# ('reason_colums'), so column 0 was never actually removed from reason_columns.

reason_columns = reason_columns.drop(['check', 0], axis = 1)

In [10]:
# The dummy columns now carry this information; keeping the original column
# as well would introduce multicollinearity

df = df.drop(columns = ['Reason for Absence'])

In [11]:
# Group the individual reason indicators into four broader reason types.
# .loc slices by column label and includes both end labels; the row-wise max is
# set for a row when any indicator inside the label range is set for that row.

reason_type_1, reason_type_2, reason_type_3, reason_type_4 = (
    reason_columns.loc[:, label_range].max(axis = 1)
    for label_range in (slice(1, 14), slice(15, 17), slice(18, 21), slice(22, None))
)

In [12]:
# Append the four reason-type Series to df as new columns; the Series are
# unnamed, so the new columns arrive labelled 0, 1, 2, 3

reason_types = [reason_type_1, reason_type_2, reason_type_3, reason_type_4]
df = pd.concat([df, *reason_types], axis = 'columns')
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [13]:
# Give the appended columns 0, 1, 2, 3 meaningful names; every other label is
# written out unchanged because the whole column index is replaced at once

original_names = [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]
reason_type_names = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

column_names = original_names + reason_type_names
df.columns = column_names
df.head(5)

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [14]:
# Reorder columns in DataFrame so that the four reason types come first

reason_first = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']
remaining = [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]

column_names_reordered = reason_first + remaining
df = df.loc[:, column_names_reordered]
df.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Checkpoint 1

In [15]:
# Checkpoint: copy the frame after the reason-column work so the following
# date processing operates on df_reason_mod and leaves df as it is
df_reason_mod = df.copy()

### "Date"


*   Convert date column from string format to timestamp
* Extract Month value
* Extract Day of the Week
* Drop Date column and Reorder columns



In [16]:
# Parse the day/month/year strings in Date into pandas Timestamps,
# then confirm the type of the first element

date_strings = df_reason_mod['Date']
df_reason_mod['Date'] = pd.to_datetime(date_strings, format = '%d/%m/%Y')
type(df_reason_mod.loc[0, 'Date'])

pandas._libs.tslibs.timestamps.Timestamp

In [17]:
# Extract Month from Date
# The .dt accessor reads the month (1-12) from every Timestamp in one vectorised
# step. This replaces a Python loop that looked each row up by label
# (df_reason_mod['Date'][i]), which was slower and only worked while the index
# was exactly 0..n-1; it also matches how Weekday is extracted.

df_reason_mod['Month'] = df_reason_mod['Date'].dt.month
df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7


In [18]:
# Day of the week as an integer (pandas counts Monday as 0 and Sunday as 6)

df_reason_mod['Weekday'] = df_reason_mod['Date'].dt.weekday
df_reason_mod.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Weekday
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


In [19]:
# Date has been replaced by Month and Weekday, so drop it and place the two
# new columns directly after the reason types

df_reason_mod = df_reason_mod.drop(columns = ['Date'])

leading_columns = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Weekday']
trailing_columns = [
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]

column_names_reordered = leading_columns + trailing_columns
df_reason_mod = df_reason_mod.loc[:, column_names_reordered]
df_reason_mod.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2


### Checkpoint 2

In [20]:
# Checkpoint: copy the frame after the date processing so the following
# education step operates on df_reason_date_mod and leaves df_reason_mod as it is
df_reason_date_mod = df_reason_mod.copy()

### "Education"

* Explore/Analyze attribute and its values
* Combine higher than high school graduates into one category using dictionaries

In [21]:
# Exploration of Education: the distinct levels and how many rows fall in each

education = df_reason_date_mod['Education']
print(education.unique())
print(education.value_counts())

[1 3 2 4]
1    583
3     73
2     40
4      4
Name: Education, dtype: int64


In [22]:
# Collapse Education into a binary flag with a lookup dictionary:
# level 1 becomes 0; levels 2, 3 and 4 (everything higher) become 1

education_groups = {1: 0, 2: 1, 3: 1, 4: 1}
education_binary = df_reason_date_mod['Education'].map(education_groups)
df_reason_date_mod['Education'] = education_binary

print(education_binary.unique())
print(education_binary.value_counts())

[0 1]
0    583
1    117
Name: Education, dtype: int64


### Final Checkpoint

In [23]:
# Final checkpoint: copy the cleaned frame and export it as a .csv file,
# leaving the row index out of the file

df_preprocessed = df_reason_date_mod.copy(deep = True)

df_preprocessed.to_csv(path_or_buf = 'Absenteeism_preprocessed.csv', index = False)

df_preprocessed.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Load the data

In [24]:
# Read the preprocessed data back from the CSV exported in the previous step
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Data Preprocessing
Create the Targets

* Median value of Absenteeism Time in hours will be used as cut-off line
* Anything at or below the median will be considered as Moderately absent
* Anything above the median will be considered as Excessively absent

In [25]:
# Find the median of the absence hours; it is the cut-off used to build the targets

data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [26]:
# Binary target: 1 when the hours are strictly above the median, 0 otherwise
# (rows exactly at the median therefore fall into class 0)
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [27]:
# Attach the targets to the frame as a new column named 'Excessive Absenteeism'

data_preprocessed = data_preprocessed.assign(**{'Excessive Absenteeism': targets})
data_preprocessed.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### Checkpoint 1

In [28]:
# The targets were derived from the raw hours, so that column is now redundant;
# drop() returns a new frame, which also serves as the checkpoint copy

data_with_targets = data_preprocessed.drop(columns = ['Absenteeism Time in Hours'])

### Select the inputs for the regression

In [29]:
# Inputs are every column except the last one, which holds the targets
unscaled_inputs = data_with_targets.iloc[:, :-1]
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month', 'Weekday',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

### Standardize the data

In [30]:
# Preparing the scaling mechanism to standardize all but the dummy attributes
# Import the relevant module

from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

# absenteeism_scaler = StandardScaler()
# absenteeism_scaler.fit(unscaled_inputs)

class CustomScaler(BaseEstimator, TransformerMixin):
  """Standardize a chosen subset of DataFrame columns and leave the others untouched.

  Wraps a StandardScaler that is fitted on, and applied to, only the columns
  named in ``columns``; the remaining columns (e.g. dummy variables) pass
  through unchanged and the original column order is restored.

  Parameters
  ----------
  columns : list of column labels to standardize.
  copy, with_mean, with_std : forwarded to the wrapped StandardScaler.

  Attributes
  ----------
  scaler : the wrapped StandardScaler.
  mean_, var_ : per-column mean and population variance (ddof = 0) of the
      selected columns, set by ``fit``.
  """

  def __init__(self, columns, copy = True, with_mean = True, with_std = True):
    # Forward the options to the wrapped scaler and store them under their own
    # names; previously they were accepted but ignored and stored as None.
    self.scaler = StandardScaler(copy = copy, with_mean = with_mean, with_std = with_std)
    self.columns = columns
    self.copy = copy
    self.with_mean = with_mean
    self.with_std = with_std

  def fit(self, X, y = None):
    """Learn the scaling statistics from the selected columns of X; returns self."""
    selected = X[self.columns]
    self.scaler.fit(selected, y)
    # DataFrame methods give one value per column on every pandas version;
    # np.mean / np.var on a DataFrame is deprecated and emits a warning.
    self.mean_ = selected.mean()
    self.var_ = selected.var(ddof = 0)
    return self

  def transform(self, X, y = None, copy = None):
    """Return X with the selected columns standardized, in X's original column order.

    ``y`` and ``copy`` are accepted for signature compatibility and are not used.
    """
    init_col_order = X.columns
    # Rebuild the scaled block on X's own index so the column-wise concat lines
    # up row by row. Without it the block gets a fresh 0..n-1 index, which
    # misaligns (and yields NaNs) whenever X is not indexed 0..n-1.
    X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                            columns = self.columns, index = X.index)
    X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
    return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

In [31]:
# Columns to standardize: everything except the four reason dummies and the
# binary Education flag
columns_to_scale = [
    'Month',
    'Weekday',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
]

In [32]:
# Scaler restricted to the columns listed in columns_to_scale
absenteeism_scaler = CustomScaler(columns = columns_to_scale)

In [33]:
# Learn the scaling statistics for the columns in columns_to_scale.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the scaling;
# consider fitting on the training split only.
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [34]:
# Standardize the selected columns; all other columns are passed through as they are

scaled_inputs = absenteeism_scaler.transform(X = unscaled_inputs)

In [35]:
# Plain-text preview of the scaled inputs (pandas abbreviates the middle rows)
print(scaled_inputs)

     Reason_1  Reason_2  Reason_3  Reason_4     Month   Weekday  \
0           0         0         0         1  0.182726 -0.683704   
1           0         0         0         0  0.182726 -0.683704   
2           0         0         0         1  0.182726 -0.007725   
3           1         0         0         0  0.182726  0.668253   
4           0         0         0         1  0.182726  0.668253   
..        ...       ...       ...       ...       ...       ...   
695         1         0         0         0 -0.388293 -0.007725   
696         1         0         0         0 -0.388293 -0.007725   
697         1         0         0         0 -0.388293  0.668253   
698         0         0         0         1 -0.388293  0.668253   
699         0         0         0         1 -0.388293  0.668253   

     Transportation Expense  Distance to Work       Age  \
0                  1.005844          0.412816 -0.536062   
1                 -1.574681         -1.141882  2.130803   
2                 

### Splitting the data for Training and Testing

In [36]:
# Import the relevant module

from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the rows for testing; the rows are shuffled (train_test_split's
# default) with a fixed seed so the split is reproducible

x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size = 0.8,
    random_state = 20,
)

# Shapes of the training pair, then of the test pair

for inputs_part, targets_part in ((x_train, y_train), (x_test, y_test)):
    print(inputs_part.shape, targets_part.shape)

(560, 14) (560,)
(140, 14) (140,)


## Modelling

In [38]:
# Import the relevant module

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [39]:
# Fit a logistic regression with scikit-learn's default settings on the training split

reg = LogisticRegression()
reg.fit(X = x_train, y = y_train)

In [40]:
# Accuracy on the training split (the data the model was fitted on),
# not an estimate of performance on unseen data

reg.score(x_train, y_train)

0.775

In [41]:
# Finding the intercept and coefficients
# intercept_ is an array holding a single value for this binary model

reg.intercept_

array([-1.6561092])

In [42]:
# One coefficient per input column, in the same order as the input columns
reg.coef_

array([[ 2.80096498e+00,  9.34857518e-01,  3.09561645e+00,
         8.56587468e-01,  1.66248119e-01, -8.43703301e-02,
         6.12732578e-01, -7.79685996e-03, -1.65922708e-01,
        -1.47005122e-04,  2.71811477e-01, -2.05738037e-01,
         3.61989880e-01, -2.85510745e-01]])

In [43]:
# Pair every input column name with its fitted coefficient
features = unscaled_inputs.columns.values

summary_table = pd.DataFrame({'Features': features})
# coef_ has one row; transposing it gives one coefficient per feature row
summary_table['Coefficient'] = reg.coef_.T

summary_table

Unnamed: 0,Features,Coefficient
0,Reason_1,2.800965
1,Reason_2,0.934858
2,Reason_3,3.095616
3,Reason_4,0.856587
4,Month,0.166248
5,Weekday,-0.08437
6,Transportation Expense,0.612733
7,Distance to Work,-0.007797
8,Age,-0.165923
9,Daily Work Load Average,-0.000147


In [44]:
# Adding intercept to summary table
# Shift every existing row label up by one so that label 0 becomes free

summary_table.index = summary_table.index + 1
# Assigning to the unused label 0 appends the intercept as a new row at the bottom
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# Sorting by label moves the intercept row to the top
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Features,Coefficient
0,Intercept,-1.656109
1,Reason_1,2.800965
2,Reason_2,0.934858
3,Reason_3,3.095616
4,Reason_4,0.856587
5,Month,0.166248
6,Weekday,-0.08437
7,Transportation Expense,0.612733
8,Distance to Work,-0.007797
9,Age,-0.165923


### Interpretation of the standardized coefficients

In [46]:
# Odds ratio = exp(coefficient); display the table sorted from largest to
# smallest odds ratio (the stored table keeps its order)

coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)
summary_table.sort_values(by = 'Odds_ratio', ascending = False)

Unnamed: 0,Features,Coefficient,Odds_ratio
3,Reason_3,3.095616,22.100858
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
4,Reason_4,0.856587,2.35511
7,Transportation Expense,0.612733,1.845467
13,Children,0.36199,1.436184
11,Body Mass Index,0.271811,1.31234
5,Month,0.166248,1.180866
10,Daily Work Load Average,-0.000147,0.999853
8,Distance to Work,-0.007797,0.992233


A feature is **NOT** particularly important **IF**:

*   *Coefficient is around 0*
*   *Odds ratio is around 1*

For example, Daily Work Load Average has a coefficient of about -0.0001 and an odds ratio of about 0.9999.
Day of the Week and Distance to Work seem to be the ones that make no difference, given all the features.

The four reasons for absence are the most important features!


1.   **Reason 3: Poisoning** Seems understandable as this would be a severe issue.
2.   **Reason 1: Various diseases** The odds of being excessively absent are about 16 times higher than for a person who gave no reason.
3.   **Reason 2: Pregnancy and giving birth** Prominent cause, but not as strong as Reason 3 or 1.
4.  **Reason 4: Light diseases**

**Transportation expense** is a standardized variable. The odds ratio of 1.8 suggests that for one standard deviation increase in the attribute, the odds of being excessively absent increases by a factor of almost 2.

Meanwhile, **Pets** is a standardized variable with an odds ratio of about 0.75. For each one-standard-deviation increase in Pets, the odds of being excessively absent decrease by about 25% (1 - 0.75).

The **intercept** simply calibrates the model.


## Testing the model

In [47]:
# Accuracy on the held-out test split
reg.score(x_test, y_test)

0.7428571428571429

In [48]:
# Class probabilities for every test row: one column per class, ordered as in reg.classes_
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.73838887, 0.26161113],
       [0.60860095, 0.39139905],
       [0.40910176, 0.59089824],
       [0.80489361, 0.19510639],
       [0.0732329 , 0.9267671 ],
       [0.31965834, 0.68034166],
       [0.31302205, 0.68697795],
       [0.13341719, 0.86658281],
       [0.79712508, 0.20287492],
       [0.75274419, 0.24725581],
       [0.48222467, 0.51777533],
       [0.1964133 , 0.8035867 ],
       [0.07857533, 0.92142467],
       [0.70622367, 0.29377633],
       [0.30708515, 0.69291485],
       [0.57055326, 0.42944674],
       [0.54143955, 0.45856045],
       [0.57205946, 0.42794054],
       [0.38194051, 0.61805949],
       [0.04857923, 0.95142077],
       [0.6977753 , 0.3022247 ],
       [0.79578125, 0.20421875],
       [0.3949288 , 0.6050712 ],
       [0.42248618, 0.57751382],
       [0.26634773, 0.73365227],
       [0.75608758, 0.24391242],
       [0.51088279, 0.48911721],
       [0.86807166, 0.13192834],
       [0.20221381, 0.79778619],
       [0.78635626, 0.21364374],
       [0.

In [49]:
# (number of test rows, number of classes)
predicted_proba.shape

(140, 2)

## Save the model

In [50]:
# Import relevant modules

import pickle

In [51]:
# Pickle the fitted logistic regression model

with open('model', 'wb') as model_file:
  pickle.dump(reg, model_file)

In [52]:
# Pickle the fitted scaler so new data can be scaled with the same statistics

with open('scaler', 'wb') as scaler_file:
  pickle.dump(absenteeism_scaler, scaler_file)

## Deployment of Model

In [54]:
from absenteeism_module import *

In [53]:
# Preview the new observations; the result is only displayed, not stored
pd.read_csv('Absenteeism_new_data.csv')

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,22,27,01/06/2018,179,26,30,237.656,19,3,0,0,
1,10,7,04/06/2018,361,52,28,237.656,27,1,1,4,
2,14,23,06/06/2018,155,12,34,237.656,25,1,2,0,
3,17,25,08/06/2018,179,22,40,237.656,22,2,2,0,
4,14,10,08/06/2018,155,12,34,237.656,25,1,2,0,
5,28,11,11/06/2018,225,26,28,237.656,24,1,1,2,
6,16,7,13/06/2018,118,15,46,275.089,25,1,2,0,
7,22,27,13/06/2018,179,26,30,275.089,19,3,0,0,
8,34,26,15/06/2018,118,10,37,275.089,28,1,0,0,
9,34,10,20/06/2018,118,10,37,275.089,28,1,0,0,


In [55]:
# Build the deployment wrapper from the 'model' and 'scaler' pickle files
# NOTE(review): presumably it unpickles both files — confirm in absenteeism_module
model = absenteeism_model('model','scaler')

In [56]:
# NOTE(review): presumably reads the new observations and applies the same
# preprocessing as this notebook — verify in absenteeism_module
model.load_and_clean_data('Absenteeism_new_data.csv')

In [57]:
# Show the cleaned inputs together with the 'Probability' and 'Prediction' columns
model.predicted_outputs()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Probability,Prediction
0,0,0.0,0,1,6,4,179,26,30,237.656,19,1,0,0,0.116467,0
1,1,0.0,0,0,6,0,361,52,28,237.656,27,0,1,4,0.873275,1
2,0,0.0,0,1,6,2,155,12,34,237.656,25,0,2,0,0.270874,0
3,0,0.0,0,1,6,4,179,22,40,237.656,22,1,2,0,0.191342,0
4,1,0.0,0,0,6,4,155,12,34,237.656,25,0,2,0,0.698491,1
5,1,0.0,0,0,6,0,225,26,28,237.656,24,0,1,2,0.728028,1
6,1,0.0,0,0,6,2,118,15,46,275.089,25,0,2,0,0.573952,1
7,0,0.0,0,1,6,2,179,26,30,275.089,19,1,0,0,0.128712,0
8,0,0.0,0,1,6,4,118,10,37,275.089,28,0,0,0,0.120982,0
9,1,0.0,0,0,6,2,118,10,37,275.089,28,0,0,0,0.518811,1


In [58]:
# Write the prediction table to CSV without the row index
# (this calls predicted_outputs() a second time rather than reusing the displayed result)
model.predicted_outputs().to_csv('Absenteeism_predictions.csv', index = False)