In [None]:
from uwv.config import CBS_OPENDATA_PROCESSED_DATA_DIR, CBS80072NED, OUTPUT_DIR

import pandas as pd
import altair as alt

# Read the parquet file named after CBS80072NED from the processed-data directory.
parquet_path = CBS_OPENDATA_PROCESSED_DATA_DIR / f"{CBS80072NED}.parquet"
cbs80072ned = pd.read_parquet(parquet_path)

cbs80072ned

In [None]:
# Dutch -> English lookup for the labels stored in `sbi_title`.
# Keys must match the Dutch titles exactly; any title that is not listed here
# ends up as NaN when the lookup is applied with `Series.map`.
translations = {
    # Branches (sections A-S)
    'A Landbouw, bosbouw en visserij': 'A Agriculture, forestry and fishing',
    'B Delfstoffenwinning': 'B Mining and quarrying',
    'C Industrie': 'C Manufacturing',
    'D Energievoorziening': 'D Energy supply',
    'E Waterbedrijven en afvalbeheer': 'E Water utilities and waste management',
    'F Bouwnijverheid': 'F Construction',
    'G Handel': 'G Trade',
    'H Vervoer en opslag': 'H Transportation and storage',
    'I Horeca': 'I Hospitality',
    'J Informatie en communicatie': 'J Information and communication',
    'K Financiële dienstverlening': 'K Financial services',
    'L Verhuur en handel van onroerend goed': 'L Rental and trade of real estate',
    # "Specialistische" means "specialised", not "special".
    'M Specialistische zakelijke diensten': 'M Specialised business services',
    'N Verhuur en overige zakelijke diensten': 'N Renting and other business services',
    # "overheidsdiensten" is plural.
    'O Openbaar bestuur en overheidsdiensten': 'O Public administration and services',
    'P Onderwijs': 'P Education',
    'Q Gezondheids- en welzijnszorg': 'Q Human health and social work activities',
    'R Cultuur, sport en recreatie': 'R Culture, sports and recreation',
    'S Overige dienstverlening': 'S Other service activities',
    # Company-size classes
    '1 tot 10 werkzame personen': '1 to 10 employed persons',
    '10 tot 100 werkzame personen': '10 to 100 employed persons',
    '100 of meer werkzame personen': '100 or more employed persons',
}

In [None]:
# Look up the English label for every Dutch title and store it as a new column.
english_titles = cbs80072ned['sbi_title'].map(translations)
cbs80072ned['sbi_title_english'] = english_titles

In [None]:
# Show the frame again, now including the sbi_title_english column.
cbs80072ned

In [None]:
# Print the column/dtype summary before sbi_title_english is cast to a category.
cbs80072ned.info()

In [None]:
# Store the English labels with the pandas 'category' dtype.
english_as_category = cbs80072ned['sbi_title_english'].astype('category')
cbs80072ned['sbi_title_english'] = english_as_category

In [None]:
# Print the column/dtype summary again to check the dtype of sbi_title_english after the cast.
cbs80072ned.info()

In [None]:
# Quarterly rows are those whose period type contains 'KW'.
quarters = cbs80072ned['period_type'].str.contains('KW')
cbs80072ned_quarters = cbs80072ned.loc[quarters]

# Keep rows from a 'Bedrijfstak' category group, plus the agriculture branch
# (selected on its title instead of its category group).
in_branch_group = cbs80072ned_quarters['category_group_title'].str.contains('Bedrijfstak')
is_agriculture = cbs80072ned_quarters['sbi_title'].str.contains('A Landbouw, bosbouw en visserij')
branches = in_branch_group | is_agriculture
cbs80072ned_quarters_branches = cbs80072ned_quarters.loc[branches]

# Exclude every period whose title mentions 2023 or 2024.
quarter_titles = cbs80072ned_quarters_branches['period_title']
data2023_quartes = quarter_titles.str.contains('2023') | quarter_titles.str.contains('2024')
cbs80072ned_quarters_branches_tot2023 = cbs80072ned_quarters_branches.loc[~data2023_quartes]

In [None]:
# Trail chart of quarterly sick leave per branch, coloured by the Dutch branch title.
quarter_axis = alt.X('period', axis=alt.Axis(title='Year and quarter', labelAngle=310))
percentage_axis = alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage'))
branch_colour = alt.Color('sbi_title', legend=alt.Legend(title=None))

(
    alt.Chart(cbs80072ned_quarters_branches_tot2023)
    .mark_trail()
    .encode(
        x=quarter_axis,
        y=percentage_axis,
        color=branch_colour,
        tooltip=['period', 'sick_leave_percentage', 'sbi_title'],
    )
    .properties(
        width=1200,
        height=600,
        title='Ziekteverzuimpercentages per branche per kwartaal (1996-2022)',
    )
)

In [None]:
# Same quarterly branch chart, but coloured and labelled with the English titles.
quarter_axis = alt.X('period', axis=alt.Axis(title='Year and quarter', labelAngle=310))
percentage_axis = alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage'))
branch_colour_english = alt.Color('sbi_title_english', legend=alt.Legend(title=None))

(
    alt.Chart(cbs80072ned_quarters_branches_tot2023)
    .mark_trail()
    .encode(
        x=quarter_axis,
        y=percentage_axis,
        color=branch_colour_english,
        tooltip=['period', 'sick_leave_percentage', 'sbi_title_english'],
    )
    .properties(
        width=1200,
        height=600,
        title='Sick leave percentages per branch per quarter (1996-2022)',
    )
)

In [None]:
# Every row that is not flagged as quarterly is kept as a yearly row.
cbs80072ned_years = cbs80072ned.loc[~quarters]

# Keep rows from a 'Bedrijfstak' category group, plus the agriculture branch
# (selected on its title instead of its category group).
in_branch_group = cbs80072ned_years['category_group_title'].str.contains('Bedrijfstak')
is_agriculture = cbs80072ned_years['sbi_title'].str.contains('A Landbouw, bosbouw en visserij')
branches = in_branch_group | is_agriculture
cbs80072ned_years_branches = cbs80072ned_years.loc[branches]

# Exclude every period whose title mentions 2023 or 2024.
year_titles = cbs80072ned_years_branches['period_title']
data2023_years = year_titles.str.contains('2023') | year_titles.str.contains('2024')
cbs80072ned_years_branches_tot2023 = cbs80072ned_years_branches.loc[~data2023_years]

In [None]:
# Trail chart of yearly sick leave per branch, coloured by the Dutch branch title.
year_axis = alt.X('period_year', axis=alt.Axis(format='d', title='Year'))
percentage_axis = alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage'))
branch_colour = alt.Color('sbi_title', legend=alt.Legend(title=None))

(
    alt.Chart(cbs80072ned_years_branches_tot2023)
    .mark_trail()
    .encode(
        x=year_axis,
        y=percentage_axis,
        color=branch_colour,
        tooltip=['period_year', 'sick_leave_percentage', 'sbi_title'],
    )
    .properties(
        width=1200,
        height=600,
        title='Ziekteverzuimpercentages per branche per jaar (1996-2022)',
    )
)

In [None]:
# Same yearly branch chart, but coloured and labelled with the English titles.
year_axis = alt.X('period_year', axis=alt.Axis(format='d', title='Year'))
percentage_axis = alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage'))
branch_colour_english = alt.Color('sbi_title_english', legend=alt.Legend(title=None))

(
    alt.Chart(cbs80072ned_years_branches_tot2023)
    .mark_trail()
    .encode(
        x=year_axis,
        y=percentage_axis,
        color=branch_colour_english,
        tooltip=['period_year', 'sick_leave_percentage', 'sbi_title_english'],
    )
    .properties(
        width=1200,
        height=600,
        title='Sick leave percentages per branch per year (1996-2022)',
    )
)

In [None]:
# Quarterly rows that belong to a 'Bedrijfsgrootte' category group.
employees_quarters = cbs80072ned_quarters['category_group_title'].str.contains('Bedrijfsgrootte')
cbs80072ned_quarters_employees = cbs80072ned_quarters.loc[employees_quarters]

# Exclude every period whose title mentions 2023 or 2024.
quarter_titles = cbs80072ned_quarters_employees['period_title']
data2023_employees_quarters = quarter_titles.str.contains('2023') | quarter_titles.str.contains('2024')
cbs80072ned_quarters_employees_tot2023 = cbs80072ned_quarters_employees.loc[~data2023_employees_quarters]

In [None]:
# Trail chart of quarterly sick leave per 'Bedrijfsgrootte' class, coloured by the Dutch label.
# Axis titles are set explicitly, as in the other trail charts of this notebook,
# so the axes do not fall back to the raw column names.
alt.Chart(data=cbs80072ned_quarters_employees_tot2023).mark_trail().encode(
    x=alt.X('period', axis=alt.Axis(title='Year and quarter', labelAngle=310)),
    y=alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage')),
    color=alt.Color('sbi_title', legend=alt.Legend(title=None)),
    tooltip=['period', 'sick_leave_percentage', 'sbi_title'],
).properties(width=1200, height=600, title='Ziekteverzuimpercentages per bedrijfsgrootte per kwartaal (1996-2022)')

In [None]:
# Trail chart of quarterly sick leave per company-size class, coloured by the English label.
# Axis titles are set explicitly, as in the other trail charts of this notebook,
# so the axes do not fall back to the raw column names.
alt.Chart(data=cbs80072ned_quarters_employees_tot2023).mark_trail().encode(
    x=alt.X('period', axis=alt.Axis(title='Year and quarter', labelAngle=310)),
    y=alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage')),
    color=alt.Color('sbi_title_english', legend=alt.Legend(title=None)),
    tooltip=['period', 'sick_leave_percentage', 'sbi_title_english'],
).properties(width=1200, height=600, title='Sick leave percentages per company size per quarter (1996-2022)')

In [None]:
# Yearly rows that belong to a 'Bedrijfsgrootte' category group.
employees_years = cbs80072ned_years['category_group_title'].str.contains('Bedrijfsgrootte')
cbs80072ned_years_employees = cbs80072ned_years.loc[employees_years]

# Exclude every period whose title mentions 2023 or 2024.
year_titles = cbs80072ned_years_employees['period_title']
data2023_employees_years = year_titles.str.contains('2023') | year_titles.str.contains('2024')
cbs80072ned_years_employees_tot2023 = cbs80072ned_years_employees.loc[~data2023_employees_years]

In [None]:
# Trail chart of yearly sick leave per 'Bedrijfsgrootte' class, coloured by the Dutch label.
year_axis = alt.X('period_year', axis=alt.Axis(format='d', title='Year'))
percentage_axis = alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage'))
size_colour = alt.Color('sbi_title', legend=alt.Legend(title=None))

(
    alt.Chart(cbs80072ned_years_employees_tot2023)
    .mark_trail()
    .encode(
        x=year_axis,
        y=percentage_axis,
        color=size_colour,
        tooltip=['period_year', 'sick_leave_percentage', 'sbi_title'],
    )
    .properties(
        width=1200,
        height=600,
        title='Ziekteverzuimpercentages per bedrijfsgrootte per jaar (1996-2022)',
    )
)

In [None]:
# Same yearly company-size chart, but coloured and labelled with the English labels.
year_axis = alt.X('period_year', axis=alt.Axis(format='d', title='Year'))
percentage_axis = alt.Y('sick_leave_percentage', axis=alt.Axis(title='Sick leave percentage'))
size_colour_english = alt.Color('sbi_title_english', legend=alt.Legend(title=None))

(
    alt.Chart(cbs80072ned_years_employees_tot2023)
    .mark_trail()
    .encode(
        x=year_axis,
        y=percentage_axis,
        color=size_colour_english,
        tooltip=['period_year', 'sick_leave_percentage', 'sbi_title_english'],
    )
    .properties(
        width=1200,
        height=600,
        title='Sick leave percentages per company size per year (1996-2022)',
    )
)

In [None]:
# Median quarterly sick leave percentage per Dutch branch title, lowest first.
sick_leave_by_branch = cbs80072ned_quarters_branches_tot2023.groupby('sbi_title', observed=True)['sick_leave_percentage']
branche_medians = sick_leave_by_branch.median().sort_values(ascending=True)

# Branch titles in that same ascending-median order.
order = branche_medians.index.tolist()

In [None]:
# Show the median sick leave percentage per Dutch branch title (sorted ascending).
branche_medians

In [None]:
# Show the Dutch branch titles in ascending-median order; this list is the sort order of the Dutch boxplot.
order

In [None]:
# Colour range for the boxplots: 19 hex codes.
colors = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
    '#8c564b',
    '#e377c2',
    '#7f7f7f',
    '#bcbd22',
    '#17becf',
    '#aec7e8',
    '#ffbb78',
    '#98df8a',
    '#ff9896',
    '#c5b0d5',
    '#c49c94',
    '#f7b6d2',
    '#c7c7c7',
    '#dbdb8d',
]

# Boxplot of the quarterly percentages per Dutch branch title; rows and colours follow `order`.
(
    alt.Chart(cbs80072ned_quarters_branches_tot2023)
    .mark_boxplot()
    .encode(
        x=alt.X('sick_leave_percentage', title='Sick leave percentage'),
        y=alt.Y('sbi_title', sort=order, title='Branche'),
        color=alt.Color('sbi_title', legend=None, sort=order, scale=alt.Scale(range=colors)),
    )
    .properties(title='Boxplot met ziekteverzuimpercentages per kwartaal (1996-2022) gesorteerd op de mediaan')
)

In [None]:
# Median quarterly sick leave percentage per English branch title, lowest first.
sick_leave_by_branch_english = cbs80072ned_quarters_branches_tot2023.groupby('sbi_title_english', observed=True)['sick_leave_percentage']
branche_medians_english = sick_leave_by_branch_english.median().sort_values(ascending=True)

# English branch titles in that same ascending-median order.
order_english = branche_medians_english.index.tolist()

In [None]:
# Show the median sick leave percentage per English branch title (sorted ascending).
branche_medians_english

In [None]:
# Show the English branch titles in ascending-median order; this list is the sort order of the English boxplot.
order_english

In [None]:
# Boxplot of the quarterly percentages per English branch title; rows and colours follow `order_english`.
percentage_axis = alt.X('sick_leave_percentage', title='Sick leave percentage')
branch_axis = alt.Y('sbi_title_english', sort=order_english, title='Branch')
branch_colour = alt.Color('sbi_title_english', legend=None, sort=order_english, scale=alt.Scale(range=colors))

(
    alt.Chart(cbs80072ned_quarters_branches_tot2023)
    .mark_boxplot()
    .encode(x=percentage_axis, y=branch_axis, color=branch_colour)
    .properties(title='Boxplot with quarterly sick leave percentages (1996-2022) sorted on median values')
)

In [None]:
from uwv.config import CBS_OPENDATA_PROCESSED_DATA_DIR, CBS80072NED, OUTPUT_DIR, CBS_OPENDATA_EXTERNAL_DATA_DIR

import pandas as pd

# Read the untyped data set CSV stored in the CBS80072NED sub-folder of the external-data directory.
untyped_csv_path = CBS_OPENDATA_EXTERNAL_DATA_DIR / CBS80072NED / f"{CBS80072NED}_UntypedDataSet.csv"
cbscsv = pd.read_csv(untyped_csv_path)

cbscsv

In [None]:
import os
from pathlib import Path

# Semicolon-separated CSV; judging by its file name it holds employed persons
# per branch for 2023 (NOTE(review): confirm the contents/provenance).
#
# The location was hard-coded to one user's machine. It can now be overridden
# with the UWV_WERKZAMEPERSONEN_2023_CSV environment variable; when the
# variable is not set the original path is used, so existing setups keep working.
# NOTE(review): ideally this file lives under a data directory exposed by uwv.config.
werkzamepersonen_2023_csv = Path(
    os.environ.get(
        'UWV_WERKZAMEPERSONEN_2023_CSV',
        r'C:\Users\mth2\OneDrive - Gemeente Breda\Bureaublad\Github\uwv\data\Werkzamepersonenperbedrijfstak_2023.csv',
    )
)
if not werkzamepersonen_2023_csv.is_file():
    raise FileNotFoundError(
        f"Cannot find {werkzamepersonen_2023_csv}; "
        "set UWV_WERKZAMEPERSONEN_2023_CSV to the location of Werkzamepersonenperbedrijfstak_2023.csv"
    )

cbswerkzamepersonen_2023 = pd.read_csv(werkzamepersonen_2023_csv, delimiter=';')

cbswerkzamepersonen_2023

In [None]:
# Rows whose period title is exactly "2022".
is_2022 = cbs80072ned['period_title'] == "2022"
cbs80072ned_2022 = cbs80072ned.loc[is_2022]
cbs80072ned_2022

In [None]:
# 2022 rows in the "Bedrijfstak" category group, plus the agriculture branch
# (selected on its exact title).
in_branch_group = cbs80072ned_2022['category_group_title'] == "Bedrijfstak"
is_agriculture = cbs80072ned_2022['sbi_title'] == "A Landbouw, bosbouw en visserij"
cbs80072ned_2022_bedrijfstakken = cbs80072ned_2022.loc[in_branch_group | is_agriculture]
cbs80072ned_2022_bedrijfstakken

In [None]:
# Columns that are not needed for the merge with the employed-persons table.
columns_to_drop = [
    'id',
    'sbi',
    'period',
    'period_title',
    'period_status',
    'period_year',
    'period_type',
    'period_quarter_number',
    'period_quarter',
    'sbi_description',
    'category_group_id',
    'category_group_title',
    'sbi_title_english',
]
cbs80072ned_2022_bedrijfstakken_drop = cbs80072ned_2022_bedrijfstakken.drop(columns=columns_to_drop)
cbs80072ned_2022_bedrijfstakken_drop

In [None]:
# Rename the columns; 'Bedrijfstakken/branches (SBI 2008)' is the key used by the merge further down.
merge_column_names = {
    'sick_leave_percentage': 'Ziekteverzuimpercentage_2022',
    'sbi_title': 'Bedrijfstakken/branches (SBI 2008)',
}
cbs80072ned_2022_bedrijfstakken_drop_renamed = cbs80072ned_2022_bedrijfstakken_drop.rename(columns=merge_column_names)
cbs80072ned_2022_bedrijfstakken_drop_renamed

In [None]:
# Inner join on the branch name: only branches present in both tables are kept.
merged = cbswerkzamepersonen_2023.merge(
    cbs80072ned_2022_bedrijfstakken_drop_renamed,
    on='Bedrijfstakken/branches (SBI 2008)',
    how='inner',
)
merged

In [None]:
# `merged` is built by the merge cell directly above; the identical merge is not
# repeated here. sort_values returns a new frame, so `merged` itself is unchanged.
# Branches ordered from highest to lowest 2022 sick leave percentage.
merged.sort_values(by='Ziekteverzuimpercentage_2022', ascending=False)

In [None]:
# `merged` is built by the earlier merge cell; the identical merge is not repeated
# here. sort_values returns a new frame, so `merged` itself is unchanged.
# Branches ordered from highest to lowest value of 'Werkzamepersonen_2023(x1000)'.
merged.sort_values(by='Werkzamepersonen_2023(x1000)', ascending=False)