In [200]:
import numpy as np
import pandas as pd
import seaborn as sbn
import matplotlib.pyplot as plt

sbn.set_theme()

In [201]:
rawdata = pd.read_csv('Absenteeism_data.csv')
rawdata.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [202]:
rawdata.describe(include='all')

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
unique,,,432,,,,,,,,,
top,,,17/08/2015,,,,,,,,,
freq,,,5,,,,,,,,,
mean,17.951429,19.411429,,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0


In [203]:
df_absenteeism = rawdata.copy()
df_absenteeism

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
695,17,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,28,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,18,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,25,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [204]:
rawdata.shape

(700, 12)

### Drop 'ID' column


In [205]:
df_absenteeism = df_absenteeism.drop(['ID'], axis=1)
df_absenteeism

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


### Split the reasons for absence into multiple dummy variables

In [206]:
df_absenteeism['Reason for Absence'].unique()

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16])

In [207]:
print('min:' ,min(df_absenteeism['Reason for Absence']))
print('max:' ,max(df_absenteeism['Reason for Absence']))
print('length: ', len(df_absenteeism['Reason for Absence'].unique()))

min: 0
max: 28
length:  28


In [208]:
# mix = 0，max = 28，長度應該是29，但卻跑出28，表示有缺少一個類別 -> 20
sorted(df_absenteeism['Reason for Absence'].unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

In [209]:
reason_dummies = pd.get_dummies(df_absenteeism['Reason for Absence'], drop_first=True)
reason_dummies

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,21,22,23,24,25,26,27,28
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
696,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
697,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
698,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [210]:
reason_dummies['check'] = reason_dummies.sum(axis=1)

In [211]:
reason_dummies

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,19,21,22,23,24,25,26,27,28,check
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,1
3,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,1
696,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
697,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,1
698,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,1


In [212]:
print(reason_dummies['check'].sum(axis=0))
print(reason_dummies['check'].unique()) # 確保check中都是1

662
[1 0]


In [213]:
reason_dummies = reason_dummies.drop(['check'], axis=1)

In [214]:
reason_dummies

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,21,22,23,24,25,26,27,28
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
696,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
697,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
698,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


### Group in the following way:
➢ Group 1: Columns 1 to 14

➢ Group 2: Columns 15, 16, and 17

➢ Group 3: Columns 18, 19, 20, and 21

➢ Group 4: Columns 22 to 28

In [215]:
reason1 = reason_dummies.loc[:, 1:14].max(axis=1)   # 總結第1類～第14類的reasons
reason2 = reason_dummies.loc[:, 15:17].max(axis=1)
reason3 = reason_dummies.loc[:, 18:21].max(axis=1)
reason4 = reason_dummies.loc[:, 22:28].max(axis=1)

In [216]:
reason1

0      False
1      False
2      False
3       True
4      False
       ...  
695     True
696     True
697     True
698    False
699    False
Length: 700, dtype: bool

In [217]:
# Concat Dataframe
df_absenteeism = pd.concat([df_absenteeism, reason1, reason2, reason3, reason4], axis=1)
df_absenteeism

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4,False,False,False,True
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0,False,False,False,False
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2,False,False,False,True
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,True,False,False,False
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8,True,False,False,False
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3,True,False,False,False
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8,True,False,False,False
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2,False,False,False,True


In [218]:
# Droup 'Reason for Absence'
df_absenteeism = df_absenteeism.drop('Reason for Absence', axis=1)
df_absenteeism

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,False,False,False,True
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,False,False,False,False
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,False,False,False,True
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,True,False,False,False
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,True,False,False,False
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,True,False,False,False
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,True,False,False,False
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,False,False,False,True


In [None]:
# Rename 0~3 columns: 如果欄位是數字就不用加上 ' '
df_absenteeism = df_absenteeism.rename(
    columns=
    {
        0:'Reason1',
        1:'Reason2',
        2:'Reason3',
        3:'Reason4'
    })
df_absenteeism

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason1,Reason2,Reason3,Reason4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,False,False,False,True
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,False,False,False,False
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,False,False,False,True
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,True,False,False,False
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,True,False,False,False
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,True,False,False,False
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,True,False,False,False
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,False,False,False,True


In [226]:
# Reorder
df_absenteeism.columns

Index(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours'],
      dtype='object')

In [224]:
reorder = ['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours']

df_absenteeism = df_absenteeism[reorder]

df_absenteeism

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,True,False,False,False,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,True,False,False,False,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,False,False,False,True,24/05/2018,235,16,32,237.656,25,3,0,0,2


### Create a checkpoint

In [242]:
df_reason_modify = df_absenteeism.copy()
df_reason_modify.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Extract the month value and the day of the week from the ‘Date’ column. Then, drop the ‘Date’ column as well.

In [249]:
df_reason_modify['Date'] = pd.to_datetime(df_reason_modify['Date'], format='%d/%m/%Y')
df_reason_modify['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

In [None]:
# Extract Month
month_list = []

for i in range(df_reason_modify.shape[0]):
    month_list.append(df_reason_modify['Date'][i].month)

month_list

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 

In [257]:
df_reason_modify['Month'] = month_list
df_reason_modify

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,1,2,1,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,2018-05-23,179,22,40,237.656,22,2,2,0,8,5
696,True,False,False,False,2018-05-23,225,26,28,237.656,24,1,1,2,3,5
697,True,False,False,False,2018-05-24,330,16,28,237.656,25,2,0,0,8,5
698,False,False,False,True,2018-05-24,235,16,32,237.656,25,3,0,0,2,5


In [262]:
# Extract the day of the week
day_list = []

for i in range(df_reason_modify.shape[0]):
    day_list.append(df_reason_modify['Date'][i].weekday())
    
day_list   

[1,
 1,
 2,
 3,
 3,
 4,
 4,
 4,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 4,
 4,
 0,
 3,
 2,
 2,
 0,
 0,
 4,
 0,
 0,
 1,
 2,
 2,
 4,
 0,
 3,
 3,
 0,
 0,
 0,
 1,
 3,
 4,
 4,
 1,
 0,
 1,
 1,
 2,
 6,
 0,
 3,
 4,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 4,
 4,
 4,
 1,
 1,
 2,
 3,
 4,
 4,
 1,
 2,
 2,
 3,
 2,
 2,
 3,
 1,
 1,
 2,
 4,
 4,
 3,
 2,
 3,
 3,
 3,
 0,
 0,
 0,
 2,
 4,
 2,
 2,
 4,
 4,
 0,
 1,
 2,
 3,
 1,
 1,
 2,
 2,
 3,
 4,
 1,
 2,
 3,
 4,
 1,
 2,
 4,
 4,
 4,
 2,
 0,
 1,
 1,
 2,
 3,
 3,
 4,
 0,
 1,
 1,
 2,
 3,
 4,
 0,
 1,
 1,
 1,
 2,
 3,
 3,
 0,
 1,
 3,
 4,
 2,
 2,
 3,
 3,
 4,
 4,
 0,
 1,
 0,
 1,
 1,
 1,
 3,
 0,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 2,
 2,
 3,
 4,
 4,
 0,
 0,
 0,
 1,
 3,
 3,
 4,
 4,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 3,
 4,
 4,
 0,
 2,
 2,
 1,
 2,
 3,
 4,
 4,
 2,
 3,
 4,
 4,
 4,
 4,
 4,
 1,
 2,
 2,
 3,
 4,
 0,
 2,
 3,
 0,
 1,
 1,
 2,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 4,
 0,
 0,
 1,
 3,
 4,
 0,
 0,
 0,
 2,
 4,
 1,
 2,
 3,
 4,
 0,
 3,
 1,
 2,
 4,
 4,
 1,
 3,
 4,
 0,
 1,
 0,
 1,
 2,
 3,
 0,


In [263]:
df_reason_modify['Day of the week'] = day_list
df_reason_modify.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


In [264]:
df_reason_modify = df_reason_modify.drop('Date', axis=1)
df_reason_modify

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the week
0,False,False,False,True,289,36,33,239.554,30,1,2,1,4,7,1
1,False,False,False,False,118,13,50,239.554,31,1,1,0,0,7,1
2,False,False,False,True,179,51,38,239.554,31,1,0,0,2,7,2
3,True,False,False,False,279,5,39,239.554,24,1,2,0,4,7,3
4,False,False,False,True,289,36,33,239.554,30,1,2,1,2,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,179,22,40,237.656,22,2,2,0,8,5,2
696,True,False,False,False,225,26,28,237.656,24,1,1,2,3,5,2
697,True,False,False,False,330,16,28,237.656,25,2,0,0,8,5,3
698,False,False,False,True,235,16,32,237.656,25,3,0,0,2,5,3


In [265]:
df_reason_modify.columns

Index(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours', 'Month',
       'Day of the week'],
      dtype='object')

In [276]:
# Reorder
reorder_month_day = ['Reason1', 'Reason2', 'Reason3', 'Reason4','Month',
       'Day of the week', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index',
       'Education', 'Children', 'Pets', 'Absenteeism Time in Hours']
df_reason_modify = df_reason_modify[reorder_month_day]

### Check Point


In [278]:
df_dm = df_reason_modify.copy()
df_dm

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,5,2,179,22,40,237.656,22,2,2,0,8
696,True,False,False,False,5,2,225,26,28,237.656,24,1,1,2,3
697,True,False,False,False,5,3,330,16,28,237.656,25,2,0,0,8
698,False,False,False,True,5,3,235,16,32,237.656,25,3,0,0,2


### Turn the data from the ‘Education’ column into binary data, by mapping the value of 0 to the values of 1, and the value of 1 to the rest of the values found in this column.

In [279]:
df_dm['Education'].unique()

array([1, 3, 2, 4])

In [280]:
df_dm['Education'].value_counts()

Education
1    583
3     73
2     40
4      4
Name: count, dtype: int64

In [281]:
df_dm['Education'] = df_dm['Education'].map({1:0, 2:1, 3:1, 4:1})
df_dm['Education']

0      0
1      0
2      0
3      0
4      0
      ..
695    1
696    0
697    1
698    1
699    0
Name: Education, Length: 700, dtype: int64

In [282]:
# check
# df_dm['Education'].unique()
df_dm['Education'].value_counts()

Education
0    583
1    117
Name: count, dtype: int64

### Final Check

In [283]:
df_processed = df_dm.copy()
df_processed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [285]:
# Store csv file
path = '/Users/edwina/Desktop/DataScience(Python)/DeepLearning/CaseStudy/'
df_processed.to_csv(path+'Absenteeism_preprocessed.csv', index=False)