In [9]:
#!pip install plotly

In [1]:
import geopandas as gpd
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import plotly.express as px

In [2]:
#!pip install statsmodels

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from time import time

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings emitted by later cells
# (e.g. while fitting the model) — confirm that is intended.
warnings.filterwarnings('ignore')

## Collection events

In [4]:
# Load the waste-collection records from a CSV located in the working directory.
waste_aust = pd.read_csv('waste_collection.csv')  

In [5]:
# Check the dtypes pandas inferred while reading the CSV.
waste_aust.dtypes

Report Date      object
Load Type        object
Load Time        object
Load Weight     float64
Dropoff Site     object
Route Type       object
Route Number     object
Load ID           int64
dtype: object

In [6]:
# Parse the two date/time columns (read in as strings) into datetime64.
for date_col in ('Load Time', 'Report Date'):
    waste_aust[date_col] = pd.to_datetime(waste_aust[date_col])

In [7]:
# Flag loads that were recorded on a US federal holiday.
cal = calendar()

# 'Load Time' carries a time of day, whereas the calendar yields midnight
# timestamps; comparing the raw values would only match loads logged at exactly
# 00:00. normalize() zeroes the time component so the comparison is by date.
load_days = waste_aust['Load Time'].dt.normalize()

# Using the normalized range also keeps a holiday on the very first day in scope.
holidays = cal.holidays(start=load_days.min(), end=load_days.max())

waste_aust['Holiday'] = load_days.isin(holidays)

In [8]:
# Name of the weekday on which each load was recorded.
waste_aust['DoW'] = waste_aust['Load Time'].dt.day_name()

# Preview the frame with the newly added columns.
waste_aust.head()

Unnamed: 0,Report Date,Load Type,Load Time,Load Weight,Dropoff Site,Route Type,Route Number,Load ID,Holiday,DoW
0,2020-12-08,BULK,2020-12-08 15:02:00,5220.0,TDS LANDFILL,BULK,BU13,899097,False,Tuesday
1,2020-12-08,RECYCLING - SINGLE STREAM,2020-12-08 10:00:00,11140.0,TDS - MRF,RECYCLING - SINGLE STREAM,RTAU53,899078,False,Tuesday
2,2020-12-03,RECYCLING - SINGLE STREAM,2020-12-03 10:34:00,10060.0,BALCONES RECYCLING,RECYCLING - SINGLE STREAM,RHBU10,899082,False,Thursday
3,2020-12-07,SWEEPING,2020-12-07 10:15:00,7100.0,TDS LANDFILL,SWEEPER DUMPSITES,DSS04,899030,False,Monday
4,2020-12-07,RECYCLING - SINGLE STREAM,2020-12-07 16:00:00,12000.0,TDS - MRF,RECYCLING - SINGLE STREAM,RMAU53,899048,False,Monday


In [141]:
# Work on a copy so later steps operate on `df` rather than on `waste_aust`.
df = waste_aust.copy()

In [142]:
# Weekly total load weight per load type.
df_prod = df.loc[:, ['Load Time', 'Load Type', 'Load Weight']].copy()

# 'Load Time' was converted to datetime by an earlier cell, so no format string
# is needed here (pandas ignores `format` for datetime64 input anyway).
df_prod['Date'] = pd.to_datetime(df_prod['Load Time'])

# Shift every timestamp back one week before binning.
# NOTE(review): W-MON bins are labelled by the Monday that closes them; the
# 7-day shift moves each load onto an earlier label — confirm the intended
# week alignment.
df_prod['Week'] = df_prod['Date'] - pd.to_timedelta(7, unit='d')

# groupby does not depend on row order, so no sort is required beforehand
# (the previous sort_values call discarded its result and had no effect).
weeklydf = df_prod.groupby(['Load Type', pd.Grouper(key='Week', freq='W-MON')])['Load Weight'].sum()

In [152]:
# Turn the ('Load Type', 'Week') index levels into regular columns.
weeklydf = weeklydf.reset_index()
weeklydf.head(3)

Unnamed: 0,Load Type,Week,Load Weight
0,BAGGED LITTER,2004-12-13,2140.0
1,BAGGED LITTER,2005-01-03,440.0
2,BAGGED LITTER,2005-01-10,400.0


In [153]:
# Preview the first three weekly rows.
weeklydf.head(3)

Unnamed: 0,Load Type,Week,Load Weight
0,BAGGED LITTER,2004-12-13,2140.0
1,BAGGED LITTER,2005-01-03,440.0
2,BAGGED LITTER,2005-01-10,400.0


In [162]:
# Total load weight per week across all load types.
# Select the column explicitly: GroupBy.sum() takes no column argument, so the
# previous positional 'Load Weight' was being interpreted as `numeric_only`.
# The double brackets keep the result a one-column DataFrame.
weekly = weeklydf.groupby('Week')[['Load Weight']].sum()

In [163]:
# Keep only the weeks inside the analysis window (both ends inclusive).
start_date, end_date = datetime(2005, 1, 1), datetime(2021, 12, 1)
weekly = weekly.loc[start_date:end_date]

In [166]:
# Display the weekly totals.
weekly

Unnamed: 0_level_0,Load Weight
Week,Unnamed: 1_level_1
2005-01-03,8.597376e+06
2005-01-10,7.958323e+06
2005-01-17,7.296092e+06
2005-01-24,6.984554e+06
2005-01-31,6.675028e+06
...,...
2021-06-07,1.183401e+07
2021-06-14,1.097690e+07
2021-06-21,1.033262e+07
2021-06-28,9.476290e+06


In [170]:
# Line chart of the weekly total load weight over time.
fig = px.line(
    x=weekly.index,
    y=weekly['Load Weight'],
    labels={'x':'Date', 'y':'Units'},
)
fig.show()

### Forecast

In [171]:
# Count missing values per column before modelling.
weekly.isna().sum()

Load Weight    0
dtype: int64

In [172]:
# Put the series on an explicit weekly (Monday-anchored) grid. The weeks were
# binned with freq='W-MON', so the frequency is known; pd.infer_freq() returns
# None as soon as a single week is missing, which would make asfreq() fail.
weekly.asfreq('W-MON')

Unnamed: 0_level_0,Load Weight
Week,Unnamed: 1_level_1
2005-01-03,8.597376e+06
2005-01-10,7.958323e+06
2005-01-17,7.296092e+06
2005-01-24,6.984554e+06
2005-01-31,6.675028e+06
...,...
2021-06-07,1.183401e+07
2021-06-14,1.097690e+07
2021-06-21,1.033262e+07
2021-06-28,9.476290e+06


In [307]:
# Attach an explicit weekly (Monday-anchored) frequency to the index.
# The weeks were binned with freq='W-MON', so the frequency is known up front;
# pd.infer_freq() returns None when any week is missing, which would make
# asfreq() fail instead of inserting the missing weeks as NaN rows.
forecast = weekly.asfreq('W-MON')

# Window of the series used for modelling (both ends inclusive).
start_date = datetime(2005,1,1)
end_date = datetime(2021,3,1)
lim_df = forecast[start_date:end_date]

dates = lim_df.index
lim_df.head(3)

Unnamed: 0_level_0,Load Weight
Week,Unnamed: 1_level_1
2005-01-03,8597376.0
2005-01-10,7958323.0
2005-01-17,7296092.0


In [308]:
# Replace any missing weekly totals with the mean of 'Load Weight'.
lim_df = lim_df.fillna(lim_df['Load Weight'].mean())

In [309]:
def perform_adf_test(series):
    """Run the Augmented Dickey-Fuller test and print its two headline numbers.

    Used to check a series for stationarity.

    Parameters
    ----------
    series : array-like
        Observations handed unchanged to ``adfuller``.

    Returns
    -------
    None
        The ADF statistic and the p-value (the first two items returned by
        ``adfuller``) are printed, one per line.
    """
    adf_output = adfuller(series)
    report_templates = ('ADF Statistic: %f', 'p-value: %f')
    # zip() stops after the two templates, so only the first two entries of
    # the adfuller result are reported.
    for template, value in zip(report_templates, adf_output):
        print(template % value)

In [310]:
# Run the ADF test on the weekly series in levels (before differencing).
perform_adf_test(lim_df)

ADF Statistic: -3.462083
p-value: 0.009016


In [311]:
# Week-over-week change; the first row has no predecessor (NaN) and is dropped.
first_diff = lim_df.diff().iloc[1:]

# Repeat the ADF test on the differenced series.
perform_adf_test(first_diff)

ADF Statistic: -10.769850
p-value: 0.000000


In [312]:
# Autocorrelation of the differenced series, drawn as one bar per returned value.
acf_vals = acf(first_diff)
fig = px.bar(y=acf_vals)
fig.show()

In [313]:
# Partial autocorrelation of the differenced series, drawn as one bar per returned value.
pacf_vals = pacf(first_diff)
fig = px.bar(y=pacf_vals)
fig.show()

In [314]:
# Chronological split: everything up to the end of 2019 is used for fitting,
# the following weeks up to `test_end` are held out for evaluation.
train_end = datetime(2019, 12, 31)
test_end = datetime(2021, 3, 30)

train_data = first_diff.loc[:train_end]
# Start the test window one day after the training cut-off so the sets do not overlap.
test_data = first_diff.loc[train_end + timedelta(days=1):test_end]

In [315]:
# Last rows of the training window.
train_data.tail(3)

Unnamed: 0_level_0,Load Weight
Week,Unnamed: 1_level_1
2019-12-16,-1653580.0
2019-12-23,218096.0
2019-12-30,988617.0


In [316]:
# First rows of the test window.
test_data.head(3)

Unnamed: 0_level_0,Load Weight
Week,Unnamed: 1_level_1
2020-01-06,-24624.0
2020-01-13,-1253590.0
2020-01-20,531126.0


In [317]:
# Latest week present in the differenced series (the statement was duplicated;
# only the last expression of a cell is displayed, so one copy is enough).
first_diff.index.max()

Timestamp('2021-03-01 00:00:00', freq='W-MON')

In [318]:
# Non-seasonal part of the model.
my_order = (1,1,1) #(p,d,q) (AR,I,MA)
# Seasonal part (P, D, Q, s): one seasonal difference with a period of 52 observations.
my_seasonal_order = (0, 1, 0, 52)
# NOTE(review): train_data is sliced from first_diff, which is already
# differenced once, so d=1 here differences the series again — confirm this
# is intended.
# define model
model = SARIMAX(train_data, order=my_order, seasonal_order=my_seasonal_order)

In [319]:
#fit the model
# Wall-clock timestamps are taken around fit() to report how long estimation took.
start = time()
model_fit = model.fit()
end = time()
print('Model Fitting Time:', end - start)

Model Fitting Time: 2.9168810844421387


In [320]:
#summary of the model
# print() is used so the summary renders as a plain-text table.
print(model_fit.summary())

                                      SARIMAX Results                                      
Dep. Variable:                         Load Weight   No. Observations:                  782
Model:             SARIMAX(1, 1, 1)x(0, 1, [], 52)   Log Likelihood              -10873.593
Date:                             Mon, 14 Mar 2022   AIC                          21753.187
Time:                                     11:54:29   BIC                          21766.962
Sample:                                 01-10-2005   HQIC                         21758.501
                                      - 12-30-2019                                         
Covariance Type:                               opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2347      0.042     -5.552      0.000      -0.318      -0.152
ma.L1         -0.9989      

In [321]:
# Forecast as many steps ahead as there are rows in the test window.
model_fit.forecast(steps=len(test_data))

2020-01-06   -1.244301e+05
2020-01-13   -1.126990e+06
2020-01-20    5.223526e+05
2020-01-27   -8.308153e+05
2020-02-03    1.149992e+05
                  ...     
2021-02-01    1.148724e+05
2021-02-08   -2.717975e+05
2021-02-15    1.032293e+06
2021-02-22   -6.702479e+05
2021-03-01   -4.292586e+05
Freq: W-MON, Name: predicted_mean, Length: 61, dtype: float64

In [322]:
#get the predictions and residuals
# Forecast six steps past the end of the test window; those extra steps stay in
# `predictions` only, since the residuals below are restricted to the test dates.
predictions = model_fit.forecast(steps=len(test_data)+6)
# Re-index the forecast onto the test dates so the subtraction aligns row by row.
predictions_error = pd.Series(predictions, index=test_data.index)
# Residual = observed change minus forecast change.
residuals = test_data['Load Weight'] - predictions_error

In [323]:
# Display the full forecast, including the steps beyond the test window.
predictions

2020-01-06   -1.244301e+05
2020-01-13   -1.126990e+06
2020-01-20    5.223526e+05
2020-01-27   -8.308153e+05
2020-02-03    1.149992e+05
                  ...     
2021-03-15    1.106901e+06
2021-03-22   -7.060817e+05
2021-03-29    1.162313e+05
2021-04-05   -4.761717e+05
2021-04-12    1.234514e+06
Freq: W-MON, Name: predicted_mean, Length: 67, dtype: float64

In [324]:
# Display the forecast errors on the test window.
residuals

Week
2020-01-06    9.980612e+04
2020-01-13   -1.265995e+05
2020-01-20    8.773433e+03
2020-01-27    4.908493e+05
2020-02-03   -1.791192e+05
                  ...     
2021-02-01    2.703876e+05
2021-02-08   -3.422162e+06
2021-02-15   -3.189393e+06
2021-02-22    1.215464e+07
2021-03-01   -1.624686e+06
Freq: W-MON, Length: 61, dtype: float64

In [325]:
# Line chart of the forecast errors over the test window.
fig = px.line(x=residuals.index, y=residuals, labels={'x':'Date', 'y':'Error'})
fig.show()

In [326]:
# Display the differenced series that the forecast is compared against.
first_diff

Unnamed: 0_level_0,Load Weight
Week,Unnamed: 1_level_1
2005-01-10,-639053.0
2005-01-17,-662231.0
2005-01-24,-311538.0
2005-01-31,-309526.0
2005-02-07,443921.0
...,...
2021-02-01,385260.0
2021-02-08,-3693960.0
2021-02-15,-2157100.0
2021-02-22,11484390.0


In [327]:
# Overlay the forecast on the differenced series.
fig = go.Figure(data=[
    go.Scatter(x=first_diff.index, y=first_diff['Load Weight'],
               mode='lines', name='TimeSeries'),
    go.Scatter(x=predictions.index, y=predictions,
               mode='lines', name='Predictions'),
])

# Background colours, black axis lines, and vertical grid lines only.
fig.update_layout(paper_bgcolor='rgba(255,255,255)', plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showline=True, linewidth=1, linecolor='black',
                 showgrid=True, gridwidth=0.1, gridcolor='lightskyblue')
fig.update_yaxes(showline=True, linewidth=1, linecolor='black')

fig.show()

In [328]:
# Mean of |residual / actual| over the test window, rounded to 4 decimals and
# reported as a fraction (it is not multiplied by 100).
# NOTE(review): the actuals here are week-over-week differences, which can be
# close to zero and inflate this ratio — confirm the metric is meaningful on
# this scale.
print('Mean Absolute Percent Error:', round(np.mean(abs(residuals/test_data['Load Weight'])),4))

Mean Absolute Percent Error: 2.0329


In [329]:
# Root mean squared error of the residuals, in the units of the differenced series.
print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2)))

Root Mean Squared Error: 1757743.7044125372
