In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Load the staffing training data; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="staffing_train.csv")

In [4]:
# Preview the first 20 rows to inspect the columns and value formats.
print(df.iloc[:20])

          date section_id  employees_on_duty  total_task_time_minutes
0   2021-01-01    SEC-001                  1               834.569483
1   2021-01-01    SEC-002                  4              1525.368990
2   2021-01-01    SEC-003                  2               662.095279
3   2021-01-01    SEC-004                  4              1499.112620
4   2021-01-01    SEC-005                  3              1238.533073
5   2021-01-01    SEC-006                 11              4283.154645
6   2021-01-04    SEC-001                  1               767.698506
7   2021-01-04    SEC-002                  7              2717.770275
8   2021-01-04    SEC-003                  6              2485.334696
9   2021-01-04    SEC-004                  6              2097.884599
10  2021-01-04    SEC-005                  4              1556.520829
11  2021-01-04    SEC-006                  6              1873.923819
12  2021-01-05    SEC-001                  4              1530.652495
13  2021-01-05    SE

In [5]:
# Preview the last 20 rows to see where the data ends.
print(df.iloc[-20:])

            date section_id  employees_on_duty  total_task_time_minutes
5782  2024-12-26    SEC-005                  7              2543.213107
5783  2024-12-26    SEC-006                  8              2955.184394
5784  2024-12-27    SEC-001                  5              1413.942415
5785  2024-12-27    SEC-002                  2              1022.775682
5786  2024-12-27    SEC-003                  2               615.576246
5787  2024-12-27    SEC-004                  4              1288.723110
5788  2024-12-27    SEC-005                 10              3623.308966
5789  2024-12-27    SEC-006                  5              1987.666281
5790  2024-12-30    SEC-001                  3              1476.008272
5791  2024-12-30    SEC-002                  5              2094.648238
5792  2024-12-30    SEC-003                  1               836.813742
5793  2024-12-30    SEC-004                  3               907.255329
5794  2024-12-30    SEC-005                  5              1721

In [6]:
# CHECK 1 – each date has 6 sections
# -----------------------------------
# Number of rows recorded for each date.
# NOTE(review): this equals the number of sections only if no section is
# repeated within a date — not verified here.
counts = df.groupby(by='date').size()

# Keep only the dates whose row count differs from the expected 6.
dates_missing_sections = counts.loc[counts.ne(6)]
print("Dates that do NOT have all 6 sections:", dates_missing_sections, sep="\n")

Dates that do NOT have all 6 sections:
Series([], dtype: int64)


In [7]:
# CHECK 2 – dates are continuous
# -----------------------------------
# Parse the 'date' column into datetime values (it is read from the CSV without
# any date parsing) so it can be compared against a generated calendar.
df['date'] = df['date'].pipe(pd.to_datetime)


In [8]:
# Every calendar day (daily frequency) from the earliest to the latest date
# present in the data, both endpoints included.
full_range = pd.date_range(df['date'].min(), df['date'].max(), freq='D')


In [9]:
# Calendar days in full_range that never occur in the data's 'date' column.
# The column is wrapped in a DatetimeIndex so the index set-difference applies.
missing_dates = full_range.difference(other=pd.DatetimeIndex(data=df['date']))

print("Missing calendar dates:", missing_dates, sep="\n")

Missing calendar dates:
DatetimeIndex(['2021-01-02', '2021-01-03', '2021-01-09', '2021-01-10',
               '2021-01-14', '2021-01-16', '2021-01-17', '2021-01-23',
               '2021-01-24', '2021-01-28',
               ...
               '2024-12-01', '2024-12-07', '2024-12-08', '2024-12-14',
               '2024-12-15', '2024-12-21', '2024-12-22', '2024-12-25',
               '2024-12-28', '2024-12-29'],
              dtype='datetime64[ns]', length=494, freq=None)


In [10]:
# Tabulate the missing dates alongside the weekday name of each one.
missing_dates_df = pd.DataFrame({'date': missing_dates}).assign(
    day_of_week=lambda frame: frame['date'].dt.day_name()
)
print(missing_dates_df)

          date day_of_week
0   2021-01-02    Saturday
1   2021-01-03      Sunday
2   2021-01-09    Saturday
3   2021-01-10      Sunday
4   2021-01-14    Thursday
..         ...         ...
489 2024-12-21    Saturday
490 2024-12-22      Sunday
491 2024-12-25   Wednesday
492 2024-12-28    Saturday
493 2024-12-29      Sunday

[494 rows x 2 columns]


In [11]:
# How many missing dates fall on each weekday, most frequent first.
# NOTE(review): the weekday gaps may be holidays/closures — confirm with the
# data owner.
day_counts = missing_dates_df['day_of_week'].value_counts(sort=True, ascending=False)
print(day_counts)

day_of_week
Saturday     209
Sunday       209
Monday        19
Thursday      18
Friday        18
Wednesday     11
Tuesday       10
Name: count, dtype: int64


In [None]:
import matplotlib.pyplot as plt

# Plot employees on duty over time, one panel per section.

# Get unique services (the data identifies them by the 'section_id' column)
services = df['section_id'].unique()

# Set up the figure: two columns and as many rows as the sections require
# (3 rows for six sections). squeeze=False keeps `axes` a 2-D array even when
# there is a single row, so flatten() always works.
n_cols = 2
n_rows = max(1, -(-len(services) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(15, 4 * n_rows), sharex=True, squeeze=False
)
axes = axes.flatten()

for ax, service in zip(axes, services):
    # Filter on 'section_id': the frame has no 'service' column, so the
    # previous df['service'] lookup raised KeyError.
    service_df = df[df['section_id'] == service]
    ax.plot(service_df['date'], service_df['employees_on_duty'], marker='o')
    ax.set_title(f'Service: {service}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Employees on Duty')
    ax.tick_params(axis='x', rotation=45)

# Hide any panel left without data (e.g. an odd number of sections).
for ax in axes[len(services):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()