# [Cross Validating Prophet at Scale](https://medium.com/dropout-analytics/cross-validating-prophet-at-scale-72b1a21b6433?source=friends_link&sk=e7b1201491dd528dfa3ad3b9a324518c)
## Parallelizing Time Series Cross-Validation and Hyperparameter Optimization with Dask

In [None]:
import pandas as pd

# Read only the date and average-distance columns, then clean them in a single chain.
df = (
    pd.read_csv(
        'https://raw.githubusercontent.com/gumdropsteve/datasets/master/yellow_cab_ymd_averages.csv',
        usecols=['ds', 'avg_trip_distance'],
    )
    # coerce both columns to their working dtypes
    .assign(
        ds=lambda frame: pd.to_datetime(frame['ds']),
        avg_trip_distance=lambda frame: frame['avg_trip_distance'].astype('float'),
    )
    # averages of 5 or more are treated as erroneous and dropped
    .loc[lambda frame: frame['avg_trip_distance'] < 5]
    # the target column is relabelled as y
    .rename(columns={'avg_trip_distance': 'y'})
)

df.plot(x='ds', y='y', figsize=(16, 4), title='Average Trip Distance (Miles) - NYC Yellow Cab')

In [None]:
from fbprophet import Prophet

# Baseline model: daily seasonality switched off, US holidays added, fitted on df.
# Each call in the chain hands back the same Prophet object, so `m` is the fitted model.
m = (
    Prophet(daily_seasonality=False)
    .add_country_holidays(country_name='US')
    .fit(df)
)

m

In [None]:
# extend the training dates by 365*4 periods (roughly four years if the
# frequency is daily, TODO confirm the default frequency)
future = m.make_future_dataframe(periods=365*4)

# predict over the extended date range with the fitted baseline model
forecast = m.predict(future)

# plot the forecast; the figure handle is kept in `fig`
fig = m.plot(forecast)

In [None]:
# display the table returned by m.predict(future)
forecast

In [None]:
# draw the fitted model's component plots for the forecast computed above
m.plot_components(forecast)

## Distributing Cross Validation with Dask

In [None]:
from distributed import Client
from fbprophet.diagnostics import cross_validation

# create a Dask distributed client with default settings
# NOTE(review): presumably this is the client that cross_validation(..., parallel="dask")
# submits its work to; confirm against the Prophet diagnostics docs
client = Client()
# bare expression so the client is shown as the cell output
client

**With Dask**

In [None]:
%%time
df_cv = cross_validation(m, 
                         horizon="365 days",
                         period="182.5 days", 
                         initial="730 days", 
                         parallel="dask"
                        )

**Default (`parallel=None`)**

In [None]:
%%time
df_cv = cross_validation(m, 
                         horizon="365 days",
                         period="182.5 days", 
                         initial="730 days", 
                         parallel=None
                        )

In [None]:
from fbprophet.diagnostics import performance_metrics

# summarise the cross-validation errors held in df_cv
# NOTE(review): rolling_window=1 presumably aggregates over the whole horizon into a
# single row (the tuning loop later reads only row 0); confirm in the Prophet docs
performance_metrics(df_cv, rolling_window=1)

## Hyperparameter Optimization with Dask (Applying Parallelized Cross-Validation)

In [None]:
%%time
import itertools

param_grid = {'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
              'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
              'seasonality_mode': ['additive', 'multiplicative']}

# generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []  # store the RMSEs for each params here

# Use cross validation to evaluate all parameters
for i in range(len(all_params)):
    print(f'cross validating param set {i} / {len(all_params) - 1}')
    
    # create & fit model with given params combo
    m = Prophet(daily_seasonality=False, **all_params[i]).fit(df)
    
    # cross validate the model 
    df_cv = cross_validation(m, 
                             horizon="365 days",
                             period="182.5 days", 
                             initial="730 days", 
                             parallel="dask")
    
    df_p = performance_metrics(df_cv, rolling_window=1)
    # pull rmse and add it to the list
    rmses.append(df_p['rmse'].values[0])

In [None]:
# one row per hyperparameter combination, with its cross-validated RMSE in an `rmse` column
tuning_results = pd.DataFrame(all_params).assign(rmse=rmses)

tuning_results

In [None]:
import numpy as np

# select the param combo with the smallest RMSE (lower error is better)
best_params = all_params[np.asarray(rmses).argmin()]

best_params

### Try the Best Model

In [None]:
# Refit using the hyperparameters selected by the grid search (`best_params`)
# instead of hand-copied values, so this cell stays correct when the data or
# the search grid changes.
m = Prophet(daily_seasonality=False, **best_params)

# holidays must be registered before fitting
m.add_country_holidays(country_name='US')

m.fit(df)

In [None]:
# extend the training dates by 365*4 periods (roughly four years if the
# frequency is daily, TODO confirm the default frequency)
future = m.make_future_dataframe(periods=365*4)

# predict over the extended date range with the refitted model
forecast = m.predict(future)

# plot the forecast; the figure handle is kept in `fig`
fig = m.plot(forecast)

In [None]:
# display the table returned by m.predict(future) for the refitted model
forecast

# Fin
**[Back to GitHub](https://github.com/gumdropsteve/intro_to_prophet)** | **[Back to Medium](https://medium.com/dropout-analytics/cross-validating-prophet-at-scale-72b1a21b6433?source=friends_link&sk=e7b1201491dd528dfa3ad3b9a324518c)**