In [1]:
import json

import pandas as pd
from datetime import datetime

import prophet.models

from generate_mock_data import MockDataGenerator

generator = MockDataGenerator()

start_date = datetime(2020, 1, 1)
end_date = datetime(2021, 1, 1)

# Materialise every generated record for the date range, then label the columns
# with the headers exposed by the generator.
generated_rows = list(generator.generate_between_dates(start_date, end_date))
df = pd.DataFrame(generated_rows, columns=generator.get_generation_headers)

# "ts" holds epoch seconds: parse them as UTC instants, then express them in Paris time.
ts_utc = pd.to_datetime(df["ts"], unit="s").dt.tz_localize('UTC')
df["ts"] = ts_utc.dt.tz_convert('Europe/Paris')

# Index by timestamp while keeping "ts" available as a regular column.
df = df.set_index('ts', drop=False)
df.head()

Importing plotly failed. Interactive plots will not work.


Unnamed: 0_level_0,ts,avg_nb_vehicules_in_per_hour,nb_trucks_in,nb_cars_in,avg_waiting_in,avg_nb_vehicules_out_per_hour,nb_trucks_out,nb_cars_out,avg_waiting_out
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-01-01 00:00:00+01:00,2020-01-01 00:00:00+01:00,127,379,2675,16.452864,117,314,2500,6.628455
2020-01-02 00:00:00+01:00,2020-01-02 00:00:00+01:00,123,246,2714,20.283782,117,433,2397,23.270235
2020-01-03 00:00:00+01:00,2020-01-03 00:00:00+01:00,128,308,2769,20.386392,123,257,2699,19.079529
2020-01-04 00:00:00+01:00,2020-01-04 00:00:00+01:00,127,188,2873,13.554466,130,207,2932,12.605381
2020-01-05 00:00:00+01:00,2020-01-05 00:00:00+01:00,71,69,1637,10.424144,86,77,1988,9.058943


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the generated dataset.
df.describe()

Unnamed: 0,avg_nb_vehicules_in_per_hour,nb_trucks_in,nb_cars_in,avg_waiting_in,avg_nb_vehicules_out_per_hour,nb_trucks_out,nb_cars_out,avg_waiting_out
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0
mean,169.814208,294.363388,3790.142077,18.403927,175.308743,339.724044,3878.021858,16.126762
std,85.557885,140.757137,1947.932952,3.983772,87.978642,157.440566,2011.47919,5.13694
min,67.0,61.0,1541.0,9.229415,79.0,67.0,1822.0,6.628455
25%,122.0,237.0,2667.0,16.471312,117.0,257.0,2500.0,12.605381
50%,127.0,269.0,2769.0,19.888154,130.0,314.0,2801.0,17.770212
75%,264.0,347.0,5826.0,21.006294,264.0,433.0,5812.25,19.326342
max,343.0,699.0,7717.0,25.227702,366.0,825.0,8265.0,26.591024


In [3]:
import prophet
from prophet import Prophet
from prophet.serialize import model_to_json, model_from_json

import optuna
from sklearn.metrics import mean_absolute_error

import json
import logging
# Quieten Prophet: raise the threshold of the package-level logger first ...
prophet_root_logger = logging.getLogger('prophet')
prophet_root_logger.setLevel(logging.WARNING)
# ... then of the module-level loggers exposed by prophet.models and prophet.forecaster.
for module_logger in (prophet.models.logger, prophet.forecaster.logger):
    module_logger.setLevel("WARN")
# Optuna uses its own logging facade.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Display the library versions this notebook was run with.
prophet.__version__, optuna.__version__

('1.0', '2.10.0')

In [4]:
class WaitingTimeInSplitter:
    """Split a dataset positionally into train/test frames using ``ds``/``y`` column names.

    The wrapped dataset must provide the columns ``ts`` (timezone-aware
    datetimes), ``avg_waiting_in``, ``nb_cars_in`` and ``nb_trucks_in``.
    Three views are produced for each side of the split:

    * the *waiting* frame: every column of the dataset, with ``ts`` renamed to
      ``ds`` (timezone dropped) and ``avg_waiting_in`` renamed to ``y``;
    * the *cars* frame: ``ds`` plus ``nb_cars_in`` renamed to ``y``;
    * the *trucks* frame: ``ds`` plus ``nb_trucks_in`` renamed to ``y``.
    """

    def __init__(self, dataset: pd.DataFrame):
        """Keep a reference to ``dataset``; it is never modified by the splitter."""
        self.dataset = dataset

    @staticmethod
    def __extract_target(prepared_df, column):
        """Return a new two-column frame (``ds``, ``y``) where ``y`` is ``column``."""
        return prepared_df[["ds", column]].rename(columns={column: 'y'})

    @staticmethod
    def __split_for_cars(prepared_train_df):
        """Return the ``ds``/``y`` frame whose target is ``nb_cars_in``."""
        return WaitingTimeInSplitter.__extract_target(prepared_train_df, "nb_cars_in")

    @staticmethod
    def __split_for_trucks(prepared_train_df):
        """Return the ``ds``/``y`` frame whose target is ``nb_trucks_in``."""
        return WaitingTimeInSplitter.__extract_target(prepared_train_df, "nb_trucks_in")

    @staticmethod
    def __prepare_df(sub_dataset):
        """Return a renamed copy of ``sub_dataset`` with a timezone-naive ``ds`` column.

        The index is discarded, ``ts`` becomes ``ds`` and ``avg_waiting_in``
        becomes ``y``. The input frame is left untouched.
        """
        return (
            sub_dataset
            .reset_index(drop=True)
            .rename(columns={'ts': 'ds', 'avg_waiting_in': 'y'})
            # Drop the timezone while keeping the local wall-clock time.
            .assign(ds=lambda frame: frame["ds"].dt.tz_localize(None))
        )

    def split(self, train_percentage, include_test = True):
        """Split the wrapped dataset by row position.

        Args:
            train_percentage: fraction of rows (0..1) assigned to the training
                part; the row count is truncated towards zero.
            include_test: when ``False`` only the training frames are returned.

        Returns:
            With ``include_test=False``: a 3-tuple
            ``(waiting_train, cars_train, trucks_train)``.
            Otherwise: three ``(train, test)`` pairs, in the order
            waiting, cars, trucks.
        """
        # Size the split from the wrapped dataset, not from a notebook-level global,
        # so the splitter gives the right answer for any frame it is handed.
        nb_rows = int(self.dataset.shape[0] * train_percentage)
        prepared_train_df = WaitingTimeInSplitter.__prepare_df(self.dataset.iloc[:nb_rows])
        prepared_train_cars_df = WaitingTimeInSplitter.__split_for_cars(prepared_train_df)
        prepared_train_trucks_df = WaitingTimeInSplitter.__split_for_trucks(prepared_train_df)
        if not include_test:
            return prepared_train_df, prepared_train_cars_df, prepared_train_trucks_df
        prepared_test_df = WaitingTimeInSplitter.__prepare_df(self.dataset.iloc[nb_rows:])
        prepared_test_cars_df = WaitingTimeInSplitter.__split_for_cars(prepared_test_df)
        prepared_test_trucks_df = WaitingTimeInSplitter.__split_for_trucks(prepared_test_df)
        return ((prepared_train_df, prepared_test_df), (prepared_train_cars_df, prepared_test_cars_df),
                (prepared_train_trucks_df, prepared_test_trucks_df))

In [5]:
class WaitingTimeInPredictor:
    """Bundle of three Prophet models used to forecast the inbound waiting time.

    Two models forecast the number of cars and trucks for a date; their
    forecasts are then passed, as the columns ``nb_cars_in`` and
    ``nb_trucks_in``, to the waiting-time model.
    """

    def __init__(self, prophet_waiting_time = None, prophet_nb_cars = None, prophet_nb_trucks = None):
        """Store the given models, creating a default ``Prophet()`` for each falsy argument."""
        self.prophet_waiting_time = prophet_waiting_time if prophet_waiting_time else Prophet()
        self.prophet_nb_cars = prophet_nb_cars if prophet_nb_cars else Prophet()
        self.prophet_nb_trucks = prophet_nb_trucks if prophet_nb_trucks else Prophet()

    @classmethod
    def load(cls, serialized):
        """Rebuild a predictor from the JSON string produced by :meth:`serialize`."""
        restored_models = {}
        for attribute_name, model_json in json.loads(serialized).items():
            restored_models[attribute_name] = model_from_json(model_json)
        return cls(**restored_models)

    def add_regressor(self, regressor):
        """Register ``regressor`` as an extra regressor of the waiting-time model only."""
        waiting_model = self.prophet_waiting_time
        waiting_model.add_regressor(regressor)

    def add_country_holidays(self, country_name):
        """Add the holidays of ``country_name`` to all three models."""
        for model in (self.prophet_waiting_time, self.prophet_nb_cars, self.prophet_nb_trucks):
            model.add_country_holidays(country_name=country_name)

    def fit(self, dataset, train_ratio = 0.8):
        """Fit the three models on the first ``train_ratio`` share of ``dataset``."""
        training_frames = WaitingTimeInSplitter(dataset).split(
            train_percentage=train_ratio, include_test=False
        )
        # The splitter returns the frames in the order: waiting, cars, trucks.
        models = (self.prophet_waiting_time, self.prophet_nb_cars, self.prophet_nb_trucks)
        for model, training_frame in zip(models, training_frames):
            model.fit(training_frame)

    @staticmethod
    def __forecast_as(model, ts_df, column_name):
        """Predict ``ts_df`` with ``model`` and return ``ds`` plus ``yhat`` renamed to ``column_name``."""
        forecast = model.predict(ts_df)
        return forecast[["ds", "yhat"]].rename(columns={"yhat": column_name})

    def predict(self, date):
        """Return the forecast waiting time (``yhat``) for the single ``date``."""
        ts_df = pd.DataFrame([date], columns=["ds"])
        cars_forecast = WaitingTimeInPredictor.__forecast_as(self.prophet_nb_cars, ts_df, "nb_cars_in")
        trucks_forecast = WaitingTimeInPredictor.__forecast_as(self.prophet_nb_trucks, ts_df, "nb_trucks_in")
        # Join both regressor forecasts on the date so the waiting-time model sees them together.
        regressors_df = cars_forecast.merge(trucks_forecast, on="ds")
        waiting_forecast = self.prophet_waiting_time.predict(regressors_df)
        return waiting_forecast["yhat"].to_numpy()[0]

    def serialize(self):
        """Return a JSON string holding the three models, keyed by attribute name."""
        attribute_names = ("prophet_waiting_time", "prophet_nb_cars", "prophet_nb_trucks")
        return json.dumps({name: model_to_json(getattr(self, name)) for name in attribute_names})

In [None]:
class NbCarsInPredictor:
    """Forecast the number of inbound cars with a single Prophet model."""

    def __init__(self, prophet_nb_cars = None ):
        """Store ``prophet_nb_cars``, creating a default ``Prophet()`` when it is falsy."""
        self.prophet_nb_cars = prophet_nb_cars or Prophet()

    @classmethod
    def load(cls, serialized):
        """Rebuild a predictor from the JSON string produced by :meth:`serialize`."""
        return cls(**{k: model_from_json(v) for k, v in json.loads(serialized).items()})

    # Only the car model is kept in this predictor.

    def add_country_holidays(self, country_name):
        """Add the holidays of ``country_name`` to the car model."""
        self.prophet_nb_cars.add_country_holidays(country_name=country_name)

    def fit(self, dataset, train_ratio = 0.8):
        """Fit the car model on the first ``train_ratio`` share of ``dataset``."""
        dataset_splitter = WaitingTimeInSplitter(dataset)
        # The splitter returns (waiting, cars, trucks); only the cars frame is needed here.
        _, cars_train_df, _ = dataset_splitter.split(train_percentage=train_ratio, include_test=False)
        self.prophet_nb_cars.fit(cars_train_df)

    def predict(self, date):
        """Return the forecast number of inbound cars (``yhat``) for the single ``date``."""
        ts_df = pd.DataFrame([date], columns=["ds"])
        forecast = self.prophet_nb_cars.predict(ts_df)
        # Read the forecast straight from "yhat": renaming the column first and then
        # looking up "yhat" would raise KeyError.
        return forecast["yhat"].to_numpy()[0]

    def serialize(self):
        """Return a JSON string holding the car model under the key ``prophet_nb_cars``."""
        return json.dumps({
            "prophet_nb_cars": model_to_json(self.prophet_nb_cars)
        })

In [6]:
class WaitingTimeInOptimizer:
    """Tune the hyper-parameters of a predictor's car-count Prophet model with Optuna.

    Only ``predictor.prophet_nb_cars`` is tuned and replaced; the waiting-time
    and truck models of the predictor are left untouched.
    """

    def __init__(self, dataset: pd.DataFrame, train_ratio = 0.8, direction: str = 'minimize'):
        """
        Args:
            dataset: full dataset, split into train/test by ``WaitingTimeInSplitter``.
            train_ratio: share of rows used for training during the search.
            direction: Optuna study direction; the objective is a mean absolute error.
        """
        self.direction = direction
        self.dataset = dataset
        self.train_ratio = train_ratio

    def optimize(self, predictor: WaitingTimeInPredictor, n_trials, country_name: str = 'FR'):
        """Search hyper-parameters for the car model and install an unfitted tuned model.

        Args:
            predictor: predictor whose ``prophet_nb_cars`` attribute is replaced.
            n_trials: number of Optuna trials to run.
            country_name: country whose holidays are used both while scoring
                the trials and on the installed model.

        The installed model is not fitted; call ``predictor.fit`` afterwards.
        """
        dataset_splitter = WaitingTimeInSplitter(self.dataset)
        # The splitter returns (waiting, cars, trucks) pairs; only the cars pair is tuned.
        _, (cars_train, cars_test), _ = dataset_splitter.split(train_percentage=self.train_ratio)

        study_cars = optuna.create_study(direction=self.direction)
        study_cars.optimize(
            lambda t: self.objective(train=cars_train, test=cars_test, trial=t, country_name=country_name),
            n_trials=n_trials,
        )

        tuned_cars_model = Prophet(**study_cars.best_params)
        # Every trial was scored with country holidays enabled. The replacement model
        # must carry them too, otherwise the tuned ``holidays_prior_scale`` has no effect
        # and holidays previously configured on the predictor's car model are lost.
        tuned_cars_model.add_country_holidays(country_name=country_name)
        predictor.prophet_nb_cars = tuned_cars_model

    @staticmethod
    def objective(train, test, trial, regressors = None, country_name: str = 'FR'):
        """Fit one candidate Prophet model and return its mean absolute error on ``test``.

        Args:
            train: frame with ``ds``/``y`` (plus regressor columns, if any) used for fitting.
            test: frame with the same columns used for scoring.
            trial: Optuna trial supplying the hyper-parameter suggestions.
            regressors: optional iterable of extra regressor column names.
            country_name: country whose holidays are added to the candidate model.
        """
        params = {
            'changepoint_prior_scale': trial.suggest_float('changepoint_prior_scale', 0.005, 5),
            'changepoint_range': trial.suggest_float('changepoint_range', 0.8, 0.9),
            'seasonality_prior_scale': trial.suggest_float('seasonality_prior_scale', 0.1, 10),
            'holidays_prior_scale': trial.suggest_float('holidays_prior_scale', 0.1, 10),
            'seasonality_mode': trial.suggest_categorical('seasonality_mode', ['multiplicative', 'additive']),
            # 'growth': trial.suggest_categorical('growth', ['linear', 'logistic']), => ValueError: Capacities must be supplied for logistic growth in column "cap"
            'growth': trial.suggest_categorical('growth', ['linear']),
            'weekly_seasonality': trial.suggest_int('weekly_seasonality', 5, 10),
            'yearly_seasonality': trial.suggest_int('yearly_seasonality', 1, 20)
        }
        m = Prophet(**params)
        m.add_country_holidays(country_name=country_name)
        for r in regressors or ():
            m.add_regressor(r)
        m.fit(train)
        predictions = m.predict(test)
        return mean_absolute_error(test['y'], predictions['yhat'])

In [7]:
import os

# https://github.com/facebook/prophet/issues/223
class suppress_stdout_stderr(object):
    """
    A context manager for doing a "deep suppression" of stdout and stderr in
    Python, i.e. will suppress all print, even if the print originates in a
    compiled C/Fortran sub-function.
       This will not suppress raised exceptions, since exceptions are printed
    to stderr just before a script exits, and after the context manager has
    exited (at least, I think that is why it lets exceptions through).

    File descriptors are acquired on entry and released on exit, so an
    instance that is never entered holds no descriptors and the same instance
    can be used for several consecutive ``with`` blocks.
    """
    def __init__(self):
        # Descriptors are opened lazily in __enter__; nothing is held until then.
        self.null_fds = []
        self.save_fds = []

    def __enter__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = [os.dup(1), os.dup(2)]
        # Assign the null pointers to stdout and stderr.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)
        return self

    def __exit__(self, *_):
        try:
            # Re-assign the real stdout/stderr back to (1) and (2)
            os.dup2(self.save_fds[0], 1)
            os.dup2(self.save_fds[1], 2)
        finally:
            # Close the null files and the saved duplicates even if restoring failed.
            for fd in self.null_fds + self.save_fds:
                os.close(fd)
            self.null_fds = []
            self.save_fds = []

In [8]:
# The row right after the 80 % training window is the sample to predict.
holdout_position = int(df.shape[0] * 0.8)
to_predict = df.iloc[holdout_position:holdout_position + 1]

# Default (untuned) predictor: French holidays on every model, and both vehicle
# counts registered as regressors of the waiting-time model.
predictor = WaitingTimeInPredictor()
predictor.add_country_holidays(country_name='FR')
for regressor_name in ('nb_trucks_in', 'nb_cars_in'):
    predictor.add_regressor(regressor_name)

with suppress_stdout_stderr():
    predictor.fit(df, 0.8)

# The predictor is queried with a timezone-naive timestamp.
holdout_naive_ts = to_predict["ts"].dt.tz_localize(None).to_numpy()[0]
predicted_waiting_in = predictor.predict(holdout_naive_ts)
print(f"Prediction[{to_predict['ts'].to_numpy()[0]}] => expected={to_predict['avg_waiting_in'].to_numpy()[0]}, predicted={predicted_waiting_in}")

Prediction[2020-10-19 00:00:00+02:00] => expected=23.520029246537447, predicted=22.70727874997336


In [9]:
# Same hold-out sample as before: the first row after the 80 % training window.
holdout_position = int(df.shape[0] * 0.8)
to_predict = df.iloc[holdout_position:holdout_position + 1]

predictor_optimized = WaitingTimeInPredictor()
predictor_optimized.add_country_holidays(country_name='FR')
for regressor_name in ('nb_trucks_in', 'nb_cars_in'):
    predictor_optimized.add_regressor(regressor_name)

optimizer = WaitingTimeInOptimizer(df)
# Run the hyper-parameter search (10 trials) and the final fit with output silenced.
with suppress_stdout_stderr():
    optimizer.optimize(predictor_optimized, 10)
    predictor_optimized.fit(df, 0.8)

holdout_naive_ts = to_predict["ts"].dt.tz_localize(None).to_numpy()[0]
predicted_waiting_in_optimized = predictor_optimized.predict(holdout_naive_ts)
print(f"Prediction[{to_predict['ts'].to_numpy()[0]}] => expected={to_predict['avg_waiting_in'].to_numpy()[0]}, predicted={predicted_waiting_in_optimized}")

Prediction[2020-10-19 00:00:00+02:00] => expected=23.520029246537447, predicted=22.860816501618327


In [10]:
# Round-trip the baseline predictor through its JSON form and predict with the copy.
serialized_model = predictor.serialize()
loaded_model = WaitingTimeInPredictor.load(serialized_model)

holdout_naive_ts = to_predict["ts"].dt.tz_localize(None).to_numpy()[0]
predicted_prediction = loaded_model.predict(holdout_naive_ts)
print(f"Prediction[{to_predict['ts'].to_numpy()[0]}] => expected={to_predict['avg_waiting_in'].to_numpy()[0]}, predicted={predicted_prediction}")

Prediction[2020-10-19 00:00:00+02:00] => expected=23.520029246537447, predicted=22.70727874997336
