# Imports

In [2]:
# DATASET: https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work

import plotly.express as px
import plotly.io as pio
import pandas as pd

# Use plotly's "seaborn" template as the default for figures in this notebook.
pio.templates.default = "seaborn"


# Loading and preprocessing

Load raw data from csv

Since the categorical variables are encoded as integers, convert them back to strings according to the attribute descriptions on [the dataset page](https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work).

In [18]:

# Read the raw export; index_col=False tells pandas not to use the first
# column as the row index, so it stays available as a regular column.
raw_csv_path = "data/raw/absenteeism_at_work_raw.csv"
df = pd.read_csv(raw_csv_path, index_col=False)

def rename_num_column_value(num_list, str_list) -> dict[int, str]:
    """Build a lookup from integer codes to their string labels.

    The two sequences are paired positionally: ``num_list[i]`` becomes the key
    for ``str_list[i]``. Pairing stops at the end of the shorter sequence, and
    a repeated code keeps the label paired with its last occurrence.
    """
    code_label_pairs = zip(num_list, str_list)
    return dict(code_label_pairs)

# Integer code -> label lookups for the categorical columns, transcribed from
# the attribute descriptions on the UCI dataset page.
# NOTE(review): "Unkown" (month code 0) is misspelled. It is left untouched
# because the label is written to the preprocessed CSV; confirm nothing
# downstream matches on it before correcting the spelling.
new_months = rename_num_column_value(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0],
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Unkown"],
)
new_days = rename_num_column_value([2, 3, 4, 5, 6], ["Mon", "Tue", "Wed", "Thr", "Fri"])
# NOTE(review): the UCI attribute list appears to number the seasons
# summer=1, autumn=2, winter=3, spring=4 -- confirm this mapping against it.
new_seasons = rename_num_column_value([1, 2, 3, 4], ["Spring", "Summer", "Fall", "Winter"])
new_education = rename_num_column_value([1, 2, 3, 4], ["h_school", "graduate", "postgrad", "master_phd"])
yes_no = rename_num_column_value([0, 1], ["no", "yes"])

# Raw column name -> lookup used to decode that column.
column_labels = {
    "Month of absence": new_months,
    "Day of the week": new_days,
    "Seasons": new_seasons,
    "Education": new_education,
    "Disciplinary failure": yes_no,
    "Social drinker": yes_no,
    "Social smoker": yes_no,
}

for column, labels in column_labels.items():
    # Fail loudly, naming the column, when the data holds a code with no label;
    # Series.map on its own would silently turn such codes into NaN.
    has_label = df[column].isin(list(labels))
    if not has_label.all():
        unknown_codes = df.loc[~has_label, column].unique().tolist()
        raise KeyError(f"No label for code(s) {unknown_codes} in column {column!r}")
    df[column] = df[column].map(labels)

# Drop the "ID" and "Hit target" columns, then shorten the remaining names.
# Note the trailing space in "Work load Average/day " -- it is part of the key.
df = df.drop(columns=["ID", "Hit target"]).rename(
    columns={
        "Reason for absence": "Absence reason",
        "Month of absence": "Month",
        "Day of the week": "Day",
        "Distance from Residence to Work": "Distance to Work",
        "Work load Average/day ": "Avg work load per day",
        "Son": "Number of children",
        "Absenteeism time in hours": "Hours absent",
        "Pet": "Pets",
    }
)



The `Absence reason` column (`Reason for absence` in the raw data) is interesting because it includes 21 categories. Each value is a code for the reason for absence, as classified by the International Classification of Diseases (ICD). Details on the encodings can be found on the dataset page [here](https://archive.ics.uci.edu/ml/datasets/Absenteeism+at+work).

Since the column **also** contains numeric codes outside of the ICD range, I will add a new column to the dataset called `Disease`, which contains `"yes"` or `"no"` to indicate whether the reason is a disease (codes 1–21) or not, respectively.

In [19]:
def is_disease(num:int) -> str:
    """Tell whether an absence-reason code falls in the ICD disease range.

    Returns "yes" for codes 1 through 21 (inclusive) and "no" for any other
    value.
    """
    icd_codes = range(1, 22)
    if num in icd_codes:
        return "yes"
    return "no"

# Label every row "yes"/"no" depending on whether its absence-reason code is
# in the ICD disease range.
absence_codes = df["Absence reason"]
df["Disease"] = absence_codes.apply(is_disease)


Write preprocessed file to storage to be used in `main.py`

In [20]:
# Save the preprocessed frame without the row index.
preprocessed_csv_path = "data/preprocessed/absenteeism_at_work_preprocessed.csv"
df.to_csv(preprocessed_csv_path, index=False)

# Testing cells

The following cells are just for testing code and do not contribute to the project

In [22]:
# Scratch check: histogram of the "Age" column.
px.histogram(data_frame=df, x="Age")

In [1]:
import json

def reason_mapping(query:int, path:str="assets/reason_mapping.json") -> str:
    """
    Return the ICD text entry for the given ICD integer entry.

    Loads the reason-mapping JSON file as a dict, converting its keys to
    integers, and looks `query` up in it. The file is re-read on every call.
    See README for details about the `Absence reason` column.

    Parameters
    ----------
    query : int
        Integer code to look up.
    path : str
        Location of the JSON mapping file. Defaults to the project's
        "assets/reason_mapping.json".

    Returns
    -------
    str
        The text stored under `query` in the mapping file.

    Raises
    ------
    KeyError
        If `query` has no entry in the mapping.
    ValueError
        If a key in the JSON file cannot be converted to an integer.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (assumes the asset is stored as UTF-8, the
    # usual encoding for JSON).
    with open(path, "r", encoding="utf-8") as fin:
        reasons = {int(code): reason for code, reason in json.load(fin).items()}
    return reasons[query]