In [1]:
import pandas as pd
from uwv.config import KNMI_PROCESSED_DATA_DIR, KNMI_AVG_TEMP, CBS_OPENDATA_PROCESSED_DATA_DIR, CBS80072NED
import plotly.graph_objects as go

# Set display options
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Set the width of the display

# Load your data
slp = pd.read_parquet(CBS_OPENDATA_PROCESSED_DATA_DIR / f"{CBS80072NED}.parquet")
knmi = pd.read_parquet(KNMI_PROCESSED_DATA_DIR / f"{KNMI_AVG_TEMP}.parquet")

# Merge data
sk = pd.merge(slp, knmi, on=['period_year', 'period_quarter_number'], how="inner")

# Create new columns
sk["slpx20"] = sk.sick_leave_percentage * 20
sk["quarter"] = sk.period_quarter_number

# Display the first few rows of the dataframe
print(sk.head())


[32m2024-12-13 15:33:53.245[0m | [1mINFO    [0m | [36muwv.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\c.hakker\OneDrive - VISTA college\Senior Stuff\Opleiding Data science\uwv[0m


   id      sbi    period  sick_leave_percentage      period_title period_status  period_year period_type  period_quarter_number  period_quarter                          sbi_title  category_group_id category_group_title  covid    avg_temp  slpx20  quarter
0   0  T001081  1996KW01                    5.5  1996 1e kwartaal    Definitief         1996          KW                      1           19961  A-U Alle economische activiteiten                  1               Totaal  False   12.333333   110.0        1
1   1  T001081  1996KW02                    4.6  1996 2e kwartaal    Definitief         1996          KW                      2           19962  A-U Alle economische activiteiten                  1               Totaal  False  119.333333    92.0        2
2   2  T001081  1996KW03                    4.0  1996 3e kwartaal    Definitief         1996          KW                      3           19963  A-U Alle economische activiteiten                  1               Totaal  False  153.6666

In [2]:
# Filter data for years 2008 and later, and for "A-U Alle economische activiteiten"
filtered_sk = sk[(sk['period_year'] >= 2008) & (sk['sbi_title'] == 'A-U Alle economische activiteiten')]

# Adjust avg_temp by dividing by 10
filtered_sk['avg_temp'] = filtered_sk['avg_temp'] / 10

# Calculate dynamic range for sick_leave_percentage
min_sick_leave = filtered_sk['sick_leave_percentage'].min() - 0.005  # Add buffer for lower range
max_sick_leave = filtered_sk['sick_leave_percentage'].max() + 0.005  # Add buffer for upper range

# Create the figure
fig = go.Figure()

# Add the sick_leave_percentage to the left y-axis
fig.add_trace(
    go.Scatter(
        x=filtered_sk['period'], 
        y=filtered_sk['sick_leave_percentage'],
        mode='lines', 
        name='CBS Sick Leave Percentage', 
        line=dict(color='#0078d2'),
        yaxis='y1'
    )
)

# Add the avg_temp to the right y-axis
fig.add_trace(
    go.Scatter(
        x=filtered_sk['period'], 
        y=filtered_sk['avg_temp'], 
        mode='lines', 
        name='KNMI Average Temperature', 
        line=dict(color='#ff6b08'),
        yaxis='y2'
    )
)

# Update the layout for two y-axes with customized settings
fig.update_layout(
    title='Weathering the Sick Days: How Temperature Affects Absenteeism',
    xaxis=dict(
        title='Period',
        showgrid=False,  # Enable gridlines
        gridcolor='lightgrey',  # Grey vertical gridlines
    ),
    yaxis=dict(
        title='Sick Leave Percentage',
        titlefont=dict(color='#0078d2'),
        tickfont=dict(color='#0078d2'),
        showgrid=True,  # Enable gridlines
        gridcolor='lightgrey',  # Grey horizontal gridlines
        range=[0, 8],  # Expanded range for sick leave percentage (3% to 7%)
            ),
    yaxis2=dict(
        title='Average Temperature Celcius',
        titlefont=dict(color='#ff6b08'),
        tickfont=dict(color='#ff6b08'),
        anchor='x',
        overlaying='y',
        side='right',
        showgrid=True,  # Enable gridlines
        gridcolor='lightgrey',  # Grey horizontal gridlines
        range=[0, 20]  # Expanded range for average temperature (0 to 25 degrees after division)
    ),
    legend=dict(
        x=1.1,  # Move the legend to the top-right corner
        y=1.4,
        xanchor='right',
        font=dict(
            family='Roboto',  # Set font to Roboto
            size=12
        )
    ),
    font=dict(
        family='Roboto',  # Set global font to Roboto
        size=14
    ),
    plot_bgcolor='white',  # Set background color to white
)

# Define the width and height of the plot (adjust as needed)
plot_width = 1000  # Width of the plot in pixels
plot_height = 600  # Height of the plot in pixels



# Save the figure as an interactive HTML file
fig.write_html("sick_leave_vs_temperature.html", include_plotlyjs='cdn', full_html=True)

print("Plot saved as PNG and HTML!")


# Show the plot
fig.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_sk['avg_temp'] = filtered_sk['avg_temp'] / 10


Plot saved as PNG and HTML!


In [3]:
# Calculate Pearson correlation between sick_leave_percentage and avg_temp
correlation = filtered_sk['sick_leave_percentage'].corr(filtered_sk['avg_temp'])
print(f"Pearson correlation: {correlation}")

Pearson correlation: -0.4265515447796651


In [4]:
import plotly.express as px

# Create a scatter plot to visualize the relationship
fig = px.scatter(
    filtered_sk, 
    x='avg_temp', 
    y='sick_leave_percentage', 
    trendline='ols',  # Optional: Add a trendline (linear regression)
    labels={
        'avg_temp': 'Average Temperature (Divided by 10)',
        'sick_leave_percentage': 'Sick Leave Percentage'
    },
    title='Correlation between Sick Leave Percentage and Average Temperature'
)

# Show the scatter plot
fig.show()


In [5]:
import statsmodels.api as sm

# Ensure the data types are numeric
filtered_sk['avg_temp'] = filtered_sk['avg_temp'].astype(float)
filtered_sk['sick_leave_percentage'] = filtered_sk['sick_leave_percentage'].astype(float)

# Add a constant to the independent variable for regression
X = sm.add_constant(filtered_sk['avg_temp'])

# Fit the ordinary least squares (OLS) model
model = sm.OLS(filtered_sk['sick_leave_percentage'], X).fit()

# Print the summary of the regression model
print(model.summary())


                              OLS Regression Results                             
Dep. Variable:     sick_leave_percentage   R-squared:                       0.182
Model:                               OLS   Adj. R-squared:                  0.169
Method:                    Least Squares   F-statistic:                     14.23
Date:                   Fri, 13 Dec 2024   Prob (F-statistic):           0.000355
Time:                           15:33:59   Log-Likelihood:                -54.510
No. Observations:                     66   AIC:                             113.0
Df Residuals:                         64   BIC:                             117.4
Df Model:                              1                                         
Covariance Type:               nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.9450 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

