# Imports

In [88]:
# DATASET: https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work

import plotly.express as px
import plotly.io as pio
import pandas as pd

# Make "seaborn" the default plotly template for this session.
pio.templates.default = "seaborn"


# Loading and preprocessing

In [89]:

# Read the raw absenteeism records; the path is relative to the working directory.
raw_csv_path = "data/raw/absenteeism_at_work_raw.csv"
df = pd.read_csv(raw_csv_path)

def rename_num_column(num_list, str_list):
    """Pair each numeric code with the label found at the same position.

    Args:
        num_list: Codes that become the dictionary keys.
        str_list: Labels that become the values, in the same order as ``num_list``.

    Returns:
        A dict mapping every code to its label. When the two sequences differ
        in length, the surplus items of the longer one are ignored (``zip``
        semantics); for a repeated code the last label wins.
    """
    return dict(zip(num_list, str_list))

# Lookup tables that translate the dataset's integer codes into readable labels.

# Month code 0 is not a calendar month, so it gets a placeholder label.
# NOTE(review): "Unkown" is a misspelling of "Unknown". It is kept as-is because the
# label is written into the data and other code may match on the literal; rename it
# together with every consumer.
new_months = rename_num_column(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0],
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Unkown"],
)

# Weekdays are coded 2 (Monday) through 6 (Friday).
new_days = rename_num_column([2, 3, 4, 5, 6], ["Mon", "Tue", "Wed", "Thr", "Fri"])

# The dataset's attribute information encodes seasons as
# summer (1), autumn (2), winter (3), spring (4) -- the labels must follow that order.
new_seasons = rename_num_column([1, 2, 3, 4], ["Summer", "Fall", "Winter", "Spring"])

new_education = rename_num_column([1, 2, 3, 4], ["h_school", "graduate", "postgrad", "master_phd"])

# Shared by every 0/1 flag column.
new_binary_yes_no = rename_num_column([0, 1], ["no", "yes"])

# Swap the integer codes of each categorical column for their readable labels.
column_labels = {
    "Month of absence": new_months,
    "Day of the week": new_days,
    "Seasons": new_seasons,
    "Education": new_education,
    "Disciplinary failure": new_binary_yes_no,
    "Social drinker": new_binary_yes_no,
    "Social smoker": new_binary_yes_no,
}
for column, labels in column_labels.items():
    # Plain dict indexing: a code missing from the table raises KeyError
    # instead of being silently turned into a missing value.
    df[column] = df[column].apply(lambda code, labels=labels: labels[code])

# Normalise the headers to lower case with underscores instead of spaces.
lowercase_columns = df.columns.str.lower()
df.columns = lowercase_columns.str.replace(" ", "_")

The `reason_for_absence` column is interesting because 21 of its codes (1–21) are disease categories from the International Classification of Diseases (ICD). Details on the encodings can be found on the dataset page [here](https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work).

Since the column **also** contains numerical reasons outside of the ICD range, I will add a new column to the dataset called `disease`, which will contain the values `yes` or `no` indicating whether the reason is a disease (codes 1–21) or not, respectively.

In [90]:
def is_disease(num: int) -> str:
    """Tell whether an absence-reason code is one of the disease codes 1-21.

    Args:
        num: The numeric reason-for-absence code.

    Returns:
        ``"yes"`` when ``num`` is one of the integers 1 through 21,
        ``"no"`` for every other value.
    """
    # Membership in a range (not a chained comparison) so that only whole
    # numbers inside 1..21 qualify.
    if num in range(1, 22):
        return "yes"
    return "no"

# Label every row with "yes"/"no" depending on whether its absence reason is a disease code.
reason_codes = df["reason_for_absence"]
df["disease"] = reason_codes.apply(is_disease)

In [91]:
# Persist the preprocessed frame for later use.
# NOTE: with to_csv's defaults the row index is written as the first (unnamed) column.
preprocessed_csv_path = "data/preprocessed/absenteeism_at_work_preprocessed.csv"
df.to_csv(preprocessed_csv_path)