# Installing prophet

In [1]:
# pip install prophet

In [2]:
# Importing libraries
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from prophet import Prophet
from tqdm import tqdm
import pandas as pd
import numpy as np
import logging

logging.getLogger('prophet').setLevel(logging.WARNING) 

## Importing train and test data
congestion = pd.read_csv('/Users/gabrielmedeiros/Desktop/Data_Science_Competitions/Comp1/train.csv')
test_data = pd.read_csv('/Users/gabrielmedeiros/Desktop/Data_Science_Competitions/Comp1/test.csv')




## Here I add the columns as strings to create the new column
congestion['location_id'] = pd.DataFrame(
                 'x' + congestion.x.astype(str) +\
                 'y' + congestion.y.astype(str) + \
                 congestion.direction.astype(str))

## Here I subset the data set into a train data for predicitons
train_prophet = congestion[['location_id', 'time' ,'congestion']]

## Here I rename the columns to match prophet's requirements (ds for dates, and y for value to be predicted)
train_prophet.columns = ['location_id','ds','y']

           
    
## Here I add the columns as strings to create the new column
test_data['location_id'] = pd.DataFrame(
                 'x' + congestion.x.astype(str) +\
                 'y' + congestion.y.astype(str) + \
                 congestion.direction.astype(str))

## Here I subset the data set into a train data for predicitons
test_prophet = test_data[['location_id', 'time']]

## Here I rename the columns to match prophet's requirements (ds for dates, and y for value to be predicted)
test_prophet.columns = ['location_id','ds']




## This data set retrieves an n number of days, in order, based on a single id match
## By doing so, I am able to retrieve a fixed number of days to be predicted for each combination
## Prophet requires the creation o an empty data frame so we can allocate the results.

## The train dataset has days of which we already have the congestion data, so we can evaluate the predictions
future_train_full = pd.DataFrame(train_prophet['ds'][train_prophet['location_id'] == 'x2y3NE'].reset_index(drop = True))

future_train = future_train_full.iloc[0:13023,:]

future_val = future_train_full.iloc[13023:13059,:]


## The test dataset contains days of whych we do not have congestion data, which are the days to be submited
future_test = pd.DataFrame(test_prophet['ds'][test_prophet['location_id'] == 'x2y3NE'].reset_index(drop = True))



## When using prophet, we need the train and test data sets in one data set
## Hence, I concatenated both data frames I just created 

future = pd.concat([future_train_full, future_test])

Importing plotly failed. Interactive plots will not work.


## Cross-Validation

In [3]:
# import itertools
# import numpy as np
# import pandas as pd
# from prophet.diagnostics import cross_validation, performance_metrics







# param_grid = {  
#     'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
#     'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
# }

# # Generate all combinations of parameters
# all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
# results = -pd.DataFrame()  # Store the RMSEs for each params here




# ## Here I create a group by for each location id, so we can have predictions based on each individual location
# locations = train_prophet.groupby('location_id')





# for params in tqdm(all_params):

#     ## Here I loop through the groups
#     for g in locations.groups:

#         ## Here I specify the group to be used (g would be here 'x0y0EB' as an example)
#         group_train = locations.get_group(g)

#         ## Here we define a new prophet model so we can refresh it and use a model for each group
#         ## Prophet will not allow one Prophet() model to be ran multiple times, that is why a new one
#         ## is called for each loop
#         m = Prophet(**params).fit(group_train)  # Fit model with given params
#         df_cv = cross_validation(m, initial = '150 days',horizon = '12 hours',parallel="processes")
#         df_p = performance_metrics(df_cv, rolling_window=1)
        
        
#         results = pd.concat([results, pd.DataFrame([df_p['rmse'].values[0],params])], ignore_index=True)

In [4]:
# params_df = pd.DataFrame(results)
# params_df.columns = ['rmse','parameters']
# params_df['parameters'] = params_df['parameters'].astype('str')


# params_df.groupby('parameters')['rmse'].mean().sort_values()

In [9]:
## This avoids prophet from printing warning messages, but still allows tqdm to show us a progress bar
logging.getLogger('prophet').setLevel(logging.WARNING) 

## Creating empty data frame to store results
results = pd.DataFrame()


## Here I create a group by for each location id, so we can have predictions based on each individual location
locations = train_prophet.groupby('location_id')

## Here I loop through the groups
for g in tqdm(locations.groups):
    
    ## Here I specify the group to be used (g would be here 'x0y0EB' as an example)
    group_train = locations.get_group(g)
    
    ## Here we define a new prophet model so we can refresh it and use a model for each group
    ## Prophet will not allow one Prophet() model to be ran multiple times, that is why a new one
    ## is called for each loop
    prophet_md = Prophet(changepoint_prior_scale = 0.001, seasonality_prior_scale = 0.01)
    
    ## Here, the group data is fittet into the model
    prophet_fit = prophet_md.fit(group_train)
    
    ## Here, prophet predicts congestion based on the dates specified on the 'future' data set
    forecast = prophet_fit.predict(future)
    
    ## This creates a new column in our data frame and appends the group name being used for that specific loop
    forecast['location_id'] = g
    
    ## Here we store the results in a single data frame
    results = pd.concat([results, forecast], ignore_index=True)



100%|██████████| 65/65 [04:48<00:00,  4.44s/it]


In [10]:
## Changing time format for visualizations
train_prophet['ds'] = pd.to_datetime(train_prophet['ds'])
future_val['ds'] = pd.to_datetime(future_val['ds'])



validation_to_evaluate = train_prophet.loc[train_prophet['ds'].isin(future_val.ds)]
validation_to_evaluate = validation_to_evaluate.sort_values(by=['location_id','ds'])

train_to_evaluate = train_prophet.loc[train_prophet['ds'].isin(future_train.ds)]
train_to_evaluate = train_to_evaluate.sort_values(by=['location_id','ds'])

val_results_to_evaluate = results.loc[results['ds'].isin(future_val.ds)]
train_results_to_evaluate = results.loc[results['ds'].isin(future_train.ds)]

print('Validation mae: {}'.format(mean_absolute_error(validation_to_evaluate.y, val_results_to_evaluate.yhat)))
print('Train mae: {}'.format(mean_absolute_error(train_to_evaluate.y, train_results_to_evaluate.yhat)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_prophet['ds'] = pd.to_datetime(train_prophet['ds'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_val['ds'] = pd.to_datetime(future_val['ds'])


Validation mae: 7.226606785557371
Train mae: 6.754171223632951


## Performance Evaluation

In [None]:
## Here, I ploted both train and test data

plt.figure(figsize = (20,15))
plt.plot(results.ds, results.yhat, color = 'blue')
# plt.plot(results.ds, results.yhat_upper, color = 'red')
# plt.plot(results.ds, results.yhat_lower, color = 'red')
plt.title('Congestion Levels Preds')
plt.xlabel('Date')
plt.ylabel('Congestion')
plt.xticks(rotation = 45)
plt.grid()
plt.show()

In [None]:
## This plot has the acutal data 
plt.figure(figsize = (20,15))
plt.plot(train_prophet.ds, train_prophet.y, color = 'blue')
plt.title('Congestion Levels')
plt.xlabel('Date')
plt.ylabel('Congestion')
plt.xticks(rotation = 45)
plt.grid()
plt.show()

In [11]:
results_to_append = results.loc[results['ds'].isin(test_prophet.ds)]
validation_to_append =  results.loc[results['ds'].isin(future_val.ds)]

In [25]:
test_data['time'] = pd.to_datetime(test_data['time'])
congestion['time'] = pd.to_datetime(congestion['time'])

In [35]:
## Example of submission
submission_test_data = test_data.merge(results_to_append, 
                                  how = 'left',
                                  left_on = ['time','location_id'], 
                                  right_on = ['ds','location_id'])

submission_test_data = submission_test_data[['row_id','yhat']]
submission_test_data.columns = ['row_id','congestion']


## Example of submission
submission_validation_data = congestion.merge(validation_to_append, 
                                  how = 'right',
                                  left_on = ['time','location_id'], 
                                  right_on = ['ds','location_id'])

submission_validation_data = submission_validation_data[['row_id','yhat']]
submission_validation_data.columns = ['row_id','congestion']


In [36]:
submission_test_data.to_csv('submission_test_data.csv',index = False)
submission_validation_data.to_csv('submission_validation_data.csv',index = False)
