In [None]:

# Loading some libraries
import pandas as pd


#pd.options.display.max_columns = None
#pd.options.display.max_rows = None

# 1. Loading The Data

In [None]:
raw_csv_data = pd.read_csv('Absenteeism_data.csv')

In [None]:
raw_csv_data.head(5)

In [None]:
# We must always make a copy of our initial dataset
df = raw_csv_data.copy()

In [None]:
df.head(4)

In [None]:
df.info()

# 2. Data preprocessing

In [None]:
# Drop the ID columns because it is a nominal data and it is not carrying any information.

df = df.drop(['ID'], axis=1)


In [None]:
df.head(2)

### Analyzing The Reasons for Absence

In [None]:
# Analyzing The Reasons for Absence

df['Reason for Absence'].head(4)

In [None]:
df['Reason for Absence'].min()

In [None]:
df['Reason for Absence'].max()

In [None]:
pd.unique(df['Reason for Absence'])

In [None]:
# An other method
df['Reason for Absence'].unique()

In [None]:
len(df['Reason for Absence'].unique())

### Obtaining Dummies from a Single Feature

In [None]:
# Get Dummies
reason_columns = pd.get_dummies(df['Reason for Absence'])

In [None]:
reason_columns.head(5)

In [None]:
# Checking a missing values
reason_columns['check'] = reason_columns.sum(axis=1) 
# If we get 0 = missimg values, 1 = single value
reason_columns.head(5)

In [None]:
reason_columns['check'].sum(axis=0) 

In [None]:
reason_columns['check'].unique()

In [None]:
# Let's drop the check columns
reason_columns = reason_columns.drop(['check'], axis=1)
reason_columns.head(4)

In [None]:
# Drop the first dummy level (reason code 0 when it is present in the data)
# to avoid potential multicollinearity issues.
# dtype=int keeps the indicators as 0/1 integers on every pandas version:
# pandas >= 2.0 emits booleans by default, which would propagate as
# True/False into the grouped reason columns and the exported CSV.
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True, dtype=int)
reason_columns.head(4)

### Group the Reasons for Absence

In [None]:
# Group the Reasons for Absence

df.columns.values

In [None]:
reason_columns.columns.values

In [None]:
# Step 1 : Drop the Reason for Absence from the df
df = df.drop(['Reason for Absence'], axis=1)
df.head(4)

In [None]:
# Step 2: collapse the individual reason indicators into four group flags.

# Each group covers an inclusive range of reason-code column labels; the
# row-wise maximum is non-zero whenever any indicator in that range is set.
(reason_type_1,
 reason_type_2,
 reason_type_3,
 reason_type_4) = (
    reason_columns.loc[:, first_code:last_code].max(axis=1)
    for first_code, last_code in ((1, 14), (15, 17), (18, 21), (22, 28))
)

In [None]:
# Step 3 : Concatenate Columns Values

df = pd.concat([df, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis=1)
df.head(4)

In [None]:
df.columns.values

In [None]:
# Rename the colmns 0, 1, 2 and 3 to Reason_1, Reason_2, Reason_3, Reason_4

columns_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

In [None]:
df.columns = columns_names

In [None]:
df.head(4)

### Reorder Columns

In [None]:
columns_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

In [None]:
df = df[columns_names_reordered]

In [None]:
df.head(4)

### Create Checkpoint

In [None]:
# Create a copy of the current state of the df DataFrame
df_reason_mod = df.copy()

In [None]:
df_reason_mod.head(3)

### Date Feature Preprocessing

In [None]:
df_reason_mod['Date'].head(4)

In [None]:
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'], format='%d/%m/%Y')

In [None]:
df_reason_mod['Date'].head(4)

In [None]:
type(df_reason_mod['Date'][0])

In [None]:
df_reason_mod.info()

### Extract the Month Value

In [None]:
df_reason_mod['Date'][0]

In [None]:
df_reason_mod['Date'][0].month
#7 = July, Note that here, Months take values from 1 to 12

In [None]:
list_months = []
list_months

In [None]:
df_reason_mod.shape

In [None]:
''' for i in range(700):
    list_months.append(df_reason_mod['Date'][i].month)
     '''
for i in range(df_reason_mod.shape[0]):
    list_months.append(df_reason_mod['Date'][i].month)

In [None]:
list_months
len(list_months)

In [None]:
df_reason_mod['Montth Value'] = list_months

In [None]:
df_reason_mod.head(4)

### Extract the Day of the Week

In [None]:
df_reason_mod['Date'][699].weekday()

In [None]:
df_reason_mod['Date'][699] # 2018-05-31 was Thursday

In [None]:
def date_to_weekday(date_value):
    """Return the weekday number of ``date_value``.

    Delegates to ``date_value.weekday()``. For ``datetime``/``Timestamp``
    objects that is 0 for Monday through 6 for Sunday.
    """
    weekday_number = date_value.weekday()
    return weekday_number

In [None]:
df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(date_to_weekday)

In [None]:
df_reason_mod.head(4)

In [None]:
df_reason_data_mod = df_reason_mod.copy()

In [None]:
df_reason_data_mod.head(4)

### Analyzing Several Straightforward Columns for This Project

In [None]:
type(df_reason_data_mod['Transportation Expense'][0])

In [None]:
type(df_reason_data_mod['Distance to Work'][0])

In [None]:
type(df_reason_data_mod['Age'][0])

In [None]:
type(df_reason_data_mod['Daily Work Load Average'][0])

In [None]:
type(df_reason_data_mod['Body Mass Index'][0])

### Working on Education, Children and Pets 


In [None]:
# Education is categorical data encoded as integers.

# We have to transform Education into a dummy variable.

In [None]:
df_reason_data_mod['Education'].unique()
# 1 = high school
# 2 = graduate
# 3 = postgraduate
# 4 = a master or doctor

In [None]:
df_reason_data_mod['Education'].value_counts()
# 583 have high level, 
# 73 have graduate level
# 40 have postgraduate level
# 4 have a master or doctor

In [None]:
# Therefore, we can combine graduate, postgraduate and master or doctor in a single category.
# The mapping reads from the df_reason_mod checkpoint rather than from the
# column being overwritten, so re-running this cell is safe: mapping the
# already-recoded 0/1 values a second time would turn every 0 into NaN and
# every 1 into 0.
df_reason_data_mod['Education'] = df_reason_mod['Education'].map({
    1 : 0,  # high school
    2 : 1,  # graduate
    3 : 1,  # postgraduate
    4 : 1   # a master or doctor
})

In [None]:
df_reason_data_mod['Education'].unique()

In [None]:
df_reason_data_mod['Education'].value_counts()

### Final Checkpoint

In [None]:
df_preprocessed = df_reason_data_mod.copy()

In [None]:
df_preprocessed.head(10)

## Exporting The Data as a *.csv File

In [None]:
# Write the preprocessed frame to disk; index=False leaves the row index out of the file.
df_preprocessed.to_csv(path_or_buf='Absenteeism_preprocessed.csv', index=False)