# Imports and configs

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from random import gauss
from random import seed
from pandas import Series
from pandas.plotting import autocorrelation_plot
from matplotlib import pyplot
from plotly.subplots import make_subplots
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import warnings
from sklearn import metrics

In [2]:
# Route every pandas `.plot()` call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

In [3]:
# Location of the call-center CSV that is read into `df` in the loading cell below.
data_path = "https://storage.googleapis.com/edulabs-public-datasets/callcenter.csv"

# Load the data

In [4]:
# Read the raw call-center records; QueueStartDate is parsed as a day-first date column.
df = pd.read_csv(
    data_path,
    dayfirst=True,
    parse_dates=["QueueStartDate"],
)

In [5]:
# Peek at the first five raw rows to check the parsed columns.
df.head(5)

Unnamed: 0,QueueStartDate,QueueStartDateName,Interval,CallTypeName,CallsOffered,CallsAnswered,CallsAbonded,CallbackRequest,CallsTransfered,WaitDuration,...,CallDuration,MaxCallDuration,AvgCallDuration,P_CallsAnswered,P_CallsAbonded,P_CallsTransfered,P_CallbackRequest,TotalAgents,TotalSupervisors,TotalAgentsAndSupervisors
0,2024-01-01,Monday,08:00 - 08:30,HdKamuti,2,1,1,0,0,933,...,116,116,116,50.0,50.0,0.0,0.0,36.0,2.0,38.0
1,2024-01-01,Monday,08:00 - 08:30,ShlihimCB,7,6,1,0,0,414,...,1029,503,171,85.71,14.29,0.0,0.0,36.0,2.0,38.0
2,2024-01-01,Monday,08:30 - 09:00,HouseDelivery,6,5,0,1,0,83,...,596,283,99,100.0,0.0,0.0,16.67,42.0,2.0,44.0
3,2024-01-01,Monday,08:30 - 09:00,SuperVisor_L,6,5,1,0,0,149,...,1428,435,285,83.33,16.67,0.0,0.0,42.0,2.0,44.0
4,2024-01-01,Monday,09:00 - 09:30,Appointments,58,41,7,10,0,14956,...,3401,181,66,87.93,12.07,0.0,17.24,44.0,3.0,47.0


# Preprocess the data

- Create a pandas Series with a date index; it should contain one element (row) per day, holding the total `CallsOffered` for that day
- We are going to analyze and later predict amount of calls per day


In [None]:
# Daily series: bucket the rows by calendar day of QueueStartDate and
# add up CallsOffered within each day.
ts = (
    df.groupby(pd.Grouper(key="QueueStartDate", freq="D"))["CallsOffered"]
    .sum()
)

In [None]:
#ts.reset_index()["QueueStartDate"].dt.isocalendar().week

# Data visualization

Display various plots and try to determine the following:
- Does the data have structure, or does it look like white noise / a random walk?
- Is there a trend in data?
- Is there seasonality?

Try the following plots:
- Lines
- Histograms
- Boxes
- Heat maps
- Other

In [None]:
# Line chart of the daily totals. The title keyword is forwarded to the plotly
# backend so the figure is self-explanatory when the notebook is skimmed.
ts.plot(title="Total calls offered per day")

In [None]:
# Distribution of daily totals, one box per day of the week.
weekday_labels = ts.index.day_name()
px.box(ts, color=weekday_labels)

In [None]:
# Distribution of daily totals, one box per calendar month.
month_labels = ts.index.month_name()
px.box(ts, color=month_labels)

# Is data stationary?

**Determine whether your data is stationary, and if not - remove trends / seasonalities**

1. Run the ADF test to find whether your data is stationary
2. If the data is not stationary - use differencing techniques to remove trend and seasonality
3. Run ADF test again after performing differencing to see whether data is stationary.
4. Run steps 2-3 again if needed

In [None]:
# Run ADF test
# Augmented Dickey-Fuller test on the raw daily series
result = adfuller(ts)

# Pull out the fields we report: test statistic, p-value, critical values
adf_stat, p_value, critical_values = result[0], result[1], result[4]

print("ADF Statistic:", adf_stat)
print("p-value:", p_value)
for level in critical_values:
    print(f"Critical Value ({level}): {critical_values[level]}")

In [None]:
# Try to remove the weekly seasonality: difference each day against the
# same weekday one week earlier, drop the leading NaNs, and plot the result.
weekly_diff = ts.diff(periods=7).dropna()
weekly_diff.plot()

The data looks quite stationary with some anomalies
- the drops in calls might indicate holidays (we can actually double-check this against the Hebrew calendar)
- the peaks need to be explained

In [None]:
# List the Israeli public holidays of 2024 (date followed by holiday name),
# in chronological order, to compare against the drops seen in the plot.

import holidays

il_holidays = holidays.Israel(years=2024)

for holiday_date in sorted(il_holidays):
    print(holiday_date, il_holidays[holiday_date])


In [None]:
# Re-run the ADF test, this time on the weekly-differenced series
result = adfuller(ts.diff(7).dropna())

# Show the statistic, the p-value and each critical value
summary_rows = [("ADF Statistic:", result[0]), ("p-value:", result[1])]
for label, number in summary_rows:
    print(label, number)
for level, threshold in result[4].items():
    print(f"Critical Value ({level}): {threshold}")

In [None]:
# Weekly-differenced series (value minus the value 7 days earlier), with the
# leading NaN rows removed; used by the white-noise checks below.
detrended = (
    ts
    .diff(periods=7)
    .dropna()
)

In [None]:
# Distribution of the weekly-differenced values, one box per calendar month.
detrended_months = detrended.index.month_name()
px.box(detrended, color=detrended_months)

# Now, after the data is stationary - check whether it's white noise (can it be forecasted at all?)

- Display ACF plot and check for auto-correlation
- Run Ljung-Box Test


Reminder:
**Autocorrelation Plot (ACF)**
- If no significant autocorrelation at any lag → likely white noise.
- If there is significant autocorrelation at lag 1 or other lags → the series has structure, likely an AR process (note that a differenced pure random walk would itself be white noise).

**Ljung-Box Test**
- A high p-value → the series looks like white noise
- A low p-value → autocorrelation exists, so the series is not white noise


In [None]:
# Autocorrelation plot of the differenced series, drawn on an explicit axes.
fig, ax = plt.subplots()
plot_acf(detrended, lags=25, ax=ax)  # Adjust lags as needed
plt.show()

We can clearly see autocorrelation at lag 1

In [None]:
# Ljung-Box test on the differenced series for lags up to 10; with
# return_df=True the result comes back as a DataFrame.
lb_test = acorr_ljungbox(detrended, lags=10, return_df=True)

# Leave the frame as the cell's last expression so the notebook renders it
# as a rich table instead of plain printed text.
lb_test

The data is clearly not white noise, so we should be able to forecast it

# **Adding test data**

- Our train data is daily callsOffered for 2024 (01-01-2024 -> 31-12-2024)
- Now we added data for 2025 (01-01-2025 -> 14-01-2025)
- You are going to train your models on the 2024 data, select the best parameters using cross-validation, and in the end perform the final test on the 2025 data

In [None]:
# Location of the 2025 CSV that is read into `test_ts` in the next cell.
test_data_path = "https://storage.googleapis.com/edulabs-public-datasets/callcenter2025.csv"

In [None]:
# Load the 2025 hold-out file and reduce it to a date-indexed series.
# NOTE(review): the column is spelled 'CallsOfferes' here, whereas the 2024 file
# uses 'CallsOffered' — confirm the 2025 CSV really uses this spelling.
# NOTE(review): unlike `ts`, no per-day aggregation is applied — presumably the
# 2025 file already has one row per day; verify.
test_df = pd.read_csv(
    test_data_path,
    dayfirst=True,
    parse_dates=['QueueStartDate'],
)
test_ts = test_df[['QueueStartDate', 'CallsOfferes']].set_index('QueueStartDate').squeeze()

# **Forecasting with xgboost**

**You are now going to forecast daily calls offered using xgboost algorithm**

In order to use time series in an algorithm that is not naturally built for time-series, you need to manually create time-series features.

- Decide how many lags back you want to include in the forecasting
- Create those lag features (it's better to write generic code so you can tune the number of lags; hint: you can use `shift`)
- Add other holiday-based and / or time-based features - like weekday, month, etc
- Don't forget to split your data into train / validation / test
- Use tree-based xgboost
- Display errors and error metrics
- Compare this model to prophet

