In [84]:
import pandas as pd
import plotly.express as px
import cbsodata


In [85]:
# Download the CBS open-data table with this ID and load its records into a DataFrame
TABLE_ID = '80072NED'
data = pd.DataFrame(cbsodata.get_data(TABLE_ID))

In [86]:
# Preview the first five rows. A bare last expression is rendered with the
# notebook's rich table display, which print() would flatten to plain text.
data.head()

   ID           BedrijfskenmerkenSBI2008          Perioden  \
0   0  A-U Alle economische activiteiten  1996 1e kwartaal   
1   1  A-U Alle economische activiteiten  1996 2e kwartaal   
2   2  A-U Alle economische activiteiten  1996 3e kwartaal   
3   3  A-U Alle economische activiteiten  1996 4e kwartaal   
4   4  A-U Alle economische activiteiten              1996   

   Ziekteverzuimpercentage_1  
0                        5.5  
1                        4.6  
2                        4.0  
3                        4.7  
4                        4.7  


In [87]:
# Split the 'Perioden' label (e.g. "1996 1e kwartaal") into numeric Year and Quarter.
# Both stay float for now: a label without a quarter part (e.g. "1996") yields NaN.
data = data.assign(
    Year=data['Perioden'].str.extract(r'(\d{4})', expand=False).astype(float),
    Quarter=data['Perioden'].str.extract(r'(\d)e kwartaal', expand=False).astype(float),
)


In [88]:
# Keep quarterly rows only: rows where Year or Quarter is NaN (e.g. full-year
# periods) are dropped. The .copy() makes the result an independent frame, so
# later column assignments on it do not raise SettingWithCopyWarning.
data = data.dropna(subset=['Year', 'Quarter']).copy()

In [89]:
# Cast both period columns from float to int in a single pass
# (this requires that no NaN values remain in either column)
data = data.astype({'Year': int, 'Quarter': int})

In [90]:
# Keep only the years 2016 through 2023 (both ends inclusive).
# The .copy() detaches the filtered rows from the original frame so that
# assigning columns to `data` afterwards does not raise SettingWithCopyWarning.
data = data[data['Year'].between(2016, 2023)].copy()

In [91]:
# Collapse to one value per (Quarter, Year) by averaging every row that shares the pair.
# NOTE(review): rows sharing a period appear to differ by 'BedrijfskenmerkenSBI2008'
# category, so this looks like an unweighted mean over categories (including the
# 'A-U' total seen in the preview) - confirm that is intended rather than
# selecting a single category.
agg_data = data.groupby(['Quarter', 'Year'], as_index=False)['Ziekteverzuimpercentage_1'].mean()

In [92]:
# Reshape to a Quarter x Year grid: quarters become the row index, years the columns
pivot_data = (
    agg_data
    .set_index(['Quarter', 'Year'])['Ziekteverzuimpercentage_1']
    .unstack('Year')
)

In [93]:
# Order the year columns from most recent to oldest
pivot_data = pivot_data.sort_index(axis=1, ascending=False)

In [94]:
# Set a colorblind-friendly color palette (disabled: seaborn is not imported in this notebook)
#sns.set_palette("colorblind")

In [95]:
# Get the current color palette
#colors = sns.color_palette("colorblind", n_colors=len(pivot_data.columns))

In [96]:
# Build the tidy quarterly table used by the chart.
# The whole preparation runs as one non-mutating chain so this cell can be
# re-run safely: assigning columns in place on the already-filtered `data`
# frame raises SettingWithCopyWarning and depends on hidden kernel state.
data = (
    data
    .assign(
        # Parse "YYYY" and the quarter digit from labels such as "1996 1e kwartaal";
        # labels without a quarter part (full-year rows) yield NaN here.
        Year=lambda frame: frame['Perioden'].str.extract(r'(\d{4})', expand=False).astype(float),
        Quarter=lambda frame: frame['Perioden'].str.extract(r'(\d)e kwartaal', expand=False).astype(float),
    )
    # Drop rows where Year or Quarter is NaN (e.g., full year periods)
    .dropna(subset=['Year', 'Quarter'])
    # Safe to cast now that no NaN values remain in either column
    .astype({'Year': int, 'Quarter': int})
    # Keep only years from 2016 to 2023 (inclusive)
    .loc[lambda frame: frame['Year'].between(2016, 2023)]
    .copy()
)

# One value per (Quarter, Year): the mean over all rows sharing that pair
agg_data = data.groupby(['Quarter', 'Year']).agg({'Ziekteverzuimpercentage_1': 'mean'}).reset_index()

# Pivot the table to get years as columns and quarters as rows
pivot_data = agg_data.pivot(index='Quarter', values='Ziekteverzuimpercentage_1', columns='Year')

# Melt back to long form for Plotly: one row per (Quarter, Year), ordered by year.
# Quarter/year pairs that are absent from agg_data appear here with a NaN value.
melted_data = pivot_data.reset_index().melt(id_vars='Quarter', var_name='Year', value_name='Sick Leave Percentage')

# Create the Plotly line chart: one line (with markers) per year across the quarters
fig = px.line(
    melted_data,
    x='Quarter',
    y='Sick Leave Percentage',
    color='Year',
    title='Sick Leave Insights: A Yearly and Quarterly Breakdown',
    markers=True,
)

# Style the figure: fixed quarter ticks on x, whole-number ticks on a truncated y-axis
fig.update_layout(
    template='ggplot2',
    xaxis_title='Quarter',
    xaxis=dict(
        tickmode='array',  # Use custom ticks
        tickvals=[1, 2, 3, 4],  # Show only full quarters
        ticktext=['Q1', 'Q2', 'Q3', 'Q4'],  # Custom labels for quarters
        showgrid=True,
        gridcolor='lightgrey',
    ),
    yaxis=dict(
        tickmode='array',
        tickvals=[3, 4, 5, 6, 7],  # Whole-number ticks only (no halves)
        # Each label matches its tick value. The axis is truncated at 3, so the
        # lowest tick must read "3"; labelling it "0" would misstate the values.
        ticktext=['3', '4', '5', '6', '7'],
        # Y-axis runs from 3 up to one point above the largest aggregated value
        range=[3, agg_data['Ziekteverzuimpercentage_1'].max() + 1],
        showgrid=True,
        gridcolor='grey',
        title=None  # Hide the default y-axis title
    ),
    plot_bgcolor='white',  # Plotting area (between the axes) in white
    paper_bgcolor='white',  # Surrounding paper/background in white
    showlegend=False,  # Remove the legend
    width=1100,  # Set the width of the plot
    height=600,  # Set the height of the plot
    # Roboto for the entire plot.
    # NOTE(review): the 'weight' font property presumably needs a recent Plotly release - confirm
    font=dict(family='Roboto', size=12, color='black', weight='bold'),
)


# Stand-in for the y-axis title: a text label pinned near the top-left of the plot,
# positioned in paper coordinates rather than data coordinates
fig.add_annotation(
    xref='paper',
    yref='paper',
    x=-0.06,  # negative x places the label to the left of the plotting area
    y=1.0,
    xanchor='left',
    yanchor='top',
    text='CBS Sick Leave %',
    font={'size': 12},
    showarrow=False,
)

# Add a year label at the last point of each line, coloured like the line.
# Rows whose value is NaN (quarter/year pairs filled in by the pivot) are skipped,
# so an incomplete year is labelled at its last real data point instead of at a
# NaN position.
observed = melted_data.dropna(subset=['Sick Leave Percentage'])

# Plotly Express names each trace after its 'Year' value; build the colour lookup
# once instead of re-deriving the list of years on every iteration.
trace_colors = {trace.name: trace.line.color for trace in fig.data}

for year, year_rows in observed.groupby('Year', sort=False):
    last_quarter = year_rows['Quarter'].max()
    last_value = year_rows.loc[year_rows['Quarter'] == last_quarter, 'Sick Leave Percentage'].iloc[0]

    # Falls back to the default font colour if no trace carries this year's name
    line_color = trace_colors.get(str(year))

    fig.add_annotation(
        x=last_quarter,
        y=last_value,
        text=str(year),
        showarrow=False,
        font=dict(color=line_color, size=12),  # Match the text color with the line
        xanchor='left',  # Text starts at the point and extends to the right
        yanchor='middle'
    )

# Render the figure inline in the notebook
fig.show()

# Save the plot as a static PNG file (path is relative to the working directory)
# NOTE(review): static image export presumably needs an extra engine such as kaleido installed - confirm
fig.write_image("sickleave_percentage_per_quarter_custom_ticks.png")

# Save as a standalone HTML file for interactive viewing
fig.write_html("sickleave_percentage_per_quarter_custom_ticks.html")
