In [33]:
import numpy as np
import pandas as pd
from scipy import stats

from bokeh.models import ColumnDataSource, CategoricalColorMapper
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import column, row

from sklearn import svm, metrics, linear_model
from sklearn.model_selection import train_test_split

In [2]:
# Load the absenteeism records; fields in this file are separated by ';'
# rather than the default comma.
data = pd.read_csv(
    'Absenteeism_at_work_AAA/Absenteeism_at_work.csv',
    sep=';',
)

Attribute Information:

1. Individual identification (ID)
2. Reason for absence (ICD).
Absences attested by the International Classification of Diseases (ICD), stratified into 21 categories (I to XXI) as follows:

I Certain infectious and parasitic diseases  
II Neoplasms  
III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism  
IV Endocrine, nutritional and metabolic diseases  
V Mental and behavioural disorders  
VI Diseases of the nervous system  
VII Diseases of the eye and adnexa  
VIII Diseases of the ear and mastoid process  
IX Diseases of the circulatory system  
X Diseases of the respiratory system  
XI Diseases of the digestive system  
XII Diseases of the skin and subcutaneous tissue  
XIII Diseases of the musculoskeletal system and connective tissue  
XIV Diseases of the genitourinary system  
XV Pregnancy, childbirth and the puerperium  
XVI Certain conditions originating in the perinatal period  
XVII Congenital malformations, deformations and chromosomal abnormalities  
XVIII Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified  
XIX Injury, poisoning and certain other consequences of external causes  
XX External causes of morbidity and mortality  
XXI Factors influencing health status and contact with health services.

And 7 categories without an ICD code: patient follow-up (22), medical consultation (23), blood donation (24), laboratory examination (25), unjustified absence (26), physiotherapy (27), dental consultation (28).
3. Month of absence
4. Day of the week (Monday (2), Tuesday (3), Wednesday (4), Thursday (5), Friday (6))
5. Seasons
6. Transportation expense
7. Distance from Residence to Work (kilometers)
8. Service time
9. Age
10. Work load Average/day 
11. Hit target
12. Disciplinary failure (yes=1; no=0)
13. Education (high school (1), graduate (2), postgraduate (3), master and doctor (4))
14. Son (number of children)
15. Social drinker (yes=1; no=0)
16. Social smoker (yes=1; no=0)
17. Pet (number of pets)
18. Weight
19. Height
20. Body mass index
21. Absenteeism time in hours (target)

In [3]:
# DataFrame.info() writes its summary (row count, per-column non-null counts
# and dtypes) to stdout itself and returns None, so wrapping it in print()
# would only append a stray "None" line after the summary.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
ID                                 740 non-null int64
Reason for absence                 740 non-null int64
Month of absence                   740 non-null int64
Day of the week                    740 non-null int64
Seasons                            740 non-null int64
Transportation expense             740 non-null int64
Distance from Residence to Work    740 non-null int64
Service time                       740 non-null int64
Age                                740 non-null int64
Work load Average/day              740 non-null float64
Hit target                         740 non-null int64
Disciplinary failure               740 non-null int64
Education                          740 non-null int64
Son                                740 non-null int64
Social drinker                     740 non-null int64
Social smoker                      740 non-null int64
Pet                          

In [25]:
# Normalise the column labels: lower-case each label and join its
# whitespace-separated words with underscores
# (e.g. 'Reason for absence' -> 'reason_for_absence').
new_cols = ['_'.join(label.lower().split()) for label in data.columns]

data.columns = new_cols

In [26]:
# Preview the first five rows of the frame.
data.head(5)

Unnamed: 0,id,reason_for_absence,month_of_absence,day_of_the_week,seasons,transportation_expense,distance_from_residence_to_work,service_time,age,work_load_average/day,...,disciplinary_failure,education,son,social_drinker,social_smoker,pet,weight,height,body_mass_index,absenteeism_time_in_hours
0,11,26,7,3,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,...,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,0,1,2,1,0,1,90,172,30,2


In [5]:
# Route Bokeh output to the notebook so later show() calls render inline.
output_notebook()

In [55]:
# Number of absence records per reason code, as a one-column frame named
# 'count' and indexed by 'reason_for_absence'.
reason_counts = (
    data
    .groupby(['reason_for_absence'])[['reason_for_absence']]
    .count()
    .rename(columns={'reason_for_absence': 'count'})
)
print(reason_counts.head())

# Absence hours grouped by reason code; left un-aggregated here.
abs_hour_reason = data.groupby(['reason_for_absence'])[['absenteeism_time_in_hours']]

                    count
reason_for_absence       
0                      43
1                      16
2                       1
3                       1
4                       2


<pandas.core.groupby.DataFrameGroupBy object at 0x10ff8a3c8>

In [48]:
# Left panel: density-normalised histogram of absence hours over 20 bins,
# drawn as unfilled green-outlined bars.
abs_hist, edges = np.histogram(
    data['absenteeism_time_in_hours'],
    bins=20,
    density=True,
)

p1 = figure(title='Absenteeism distribution', plot_height=400, plot_width=400, tools=[])
p1.quad(
    left=edges[:-1],
    right=edges[1:],
    bottom=0,
    top=abs_hist,
    fill_alpha=0,
    line_color='green',
)

# Right panel: one bar per reason code, with the hover tool enabled.
tools = ['hover']
source = ColumnDataSource(reason_counts)

p2 = figure(title='Reason counts', plot_height=400, plot_width=400, tools=tools)
p2.vbar(
    source=source,
    x='reason_for_absence',
    top='count',
    width=0.9,
)

# Place both figures side by side and render them.
layout = row(p1, p2)

show(layout)