In [1]:
import numpy as np
import pandas as pd
import data_download as dd
import utils as ut
import sqlite3
from prophet import Prophet
import plotly.graph_objs as go
from datetime import timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np

def create_synthetic_data(start_date, end_date, yearly_growth, initial_value, noise=10, seed=None):
    """
    Create a DataFrame with two columns: 'ds' and 'y'.

    The 'ds' column holds one row per day from start_date to end_date
    (inclusive). The 'y' column is the sum of an annual sinusoidal component,
    a day-of-week sinusoidal component, Gaussian noise and a linear growth
    term, shifted so that the first observation equals initial_value.

    :param start_date: str or date-like, first day of the series.
    :param end_date: str or date-like, last day of the series (inclusive).
    :param yearly_growth: float, amount added to 'y' per 365.25 elapsed days.
    :param initial_value: int, value of 'y' for the first observation.
    :param noise: int or float, standard deviation of the Gaussian noise.
    :param seed: optional seed for a private random generator. When None
        (the default) the noise is drawn from NumPy's global random state,
        so it still responds to ``np.random.seed``.
    :return: pandas.DataFrame with columns 'ds' (datetime) and 'y' (int).
    :raises ValueError: if the date range is empty (end_date before start_date).
    """
    # Generate date range
    date_range = pd.date_range(start=start_date, end=end_date)
    if len(date_range) == 0:
        raise ValueError(
            f"Empty date range: start_date={start_date!r} is after end_date={end_date!r}"
        )
    df = pd.DataFrame(date_range, columns=['ds'])

    # Annual component: sine of the elapsed time (period 365.25 days) ...
    days_since_start = (df['ds'] - df['ds'].iloc[0]).dt.days
    annual_sine = np.sin(2 * np.pi * days_since_start / 365.25)

    # ... modulated by a cosine of the calendar day-of-year (shifted by 10 days)
    # and scaled to an amplitude of 20.
    df['y'] = annual_sine * np.cos((df['ds'].dt.dayofyear + 10) / 365.25 * 2 * np.pi) * 20

    # Day-of-week component with amplitude 10. dayofweek runs 0 (Monday) to
    # 6 (Sunday) and is divided by 6, so Monday and Sunday both map to zero;
    # the curve is highest on Tuesday/Wednesday and lowest on Friday/Saturday.
    weekly_trend = np.sin(df['ds'].dt.dayofweek / 6 * 2 * np.pi)
    df['y'] += weekly_trend * 10

    # Add random noise. A private generator is used only when a seed is given,
    # so unseeded callers keep drawing from the global NumPy random state.
    if seed is None:
        noise_values = np.random.normal(0, noise, len(df))
    else:
        noise_values = np.random.default_rng(seed).normal(0, noise, len(df))
    df['y'] += noise_values

    # Apply yearly growth (linear in elapsed days)
    df['y'] += (days_since_start / 365.25) * yearly_growth

    # Shift the series so the first observation equals initial_value, then
    # convert to int (astype(int) truncates toward zero rather than rounding).
    df['y'] = df['y'] - df['y'].iloc[0] + initial_value
    df['y'] = df['y'].astype(int)

    return df

def create_date_range_dataframe(x, a, b):
    """
    Build a one-column DataFrame of consecutive daily dates around a pivot date.

    The single column 'ds' runs from ``x - a`` days to ``x + b`` days,
    both endpoints included.

    :param x: str or date-like, pivot date (e.g. 'YYYY-MM-DD')
    :param a: int, number of days to include before x
    :param b: int, number of days to include after x
    :return: pandas.DataFrame with a single datetime column 'ds'
    """
    pivot = pd.to_datetime(x)

    # Daily timestamps spanning the window [pivot - a days, pivot + b days].
    window = pd.date_range(
        start=pivot - timedelta(days=a),
        end=pivot + timedelta(days=b),
        freq='D',
    )

    return pd.DataFrame(window, columns=['ds'])



In [None]:
# Get the latest data
# NOTE(review): get_historical_api_data lives in the project-local
# data_download module; presumably it pulls historical records from an
# external API - confirm what it fetches and where it stores the result.
dd.get_historical_api_data()

In [20]:
# Refresh the local data store via the project-local utils module.
# NOTE(review): the recorded output ("Data inserted successfully!") suggests
# this inserts rows into the database read below - confirm against utils.
ut.update_data()

Data inserted successfully!


In [21]:
# Load config dict
config = ut.load_config()

# Connect to the SQLite database named in the config
conn = sqlite3.connect(config['db_name'])

try:
    # Query the database to fetch the required data
    raw = pd.read_sql_query("SELECT * FROM reservations", conn)
finally:
    # Always close the connection, even if the query raises
    # (e.g. the reservations table does not exist yet).
    conn.close()

In [22]:
# Add a datetime column 'ds' parsed from the first 10 characters of CREATEDUTC
# (presumably the 'YYYY-MM-DD' part of a timestamp string - confirm the format).
df = raw.assign(ds=pd.to_datetime(raw['CREATEDUTC'].str[:10]))

# Keep only rows strictly after 2022-08-01.
df = df.loc[df['ds'] > '2022-08-01']

# One row per day with the number of records created on that day.
grouped_df = (
    df.groupby('ds')
    .size()
    .reset_index(name='y')
)
grouped_df.tail()

Unnamed: 0,ds,y
314,2023-11-13,11
315,2023-11-14,14
316,2023-11-15,5
317,2023-11-16,17
318,2023-11-17,7


In [6]:
# Example usage
# NOTE: this replaces the grouped_df built from the reservations table with
# synthetic data; every cell below therefore works on the synthetic series.
SEED = 42  # fixed seed so the synthetic noise (and the metrics below) are reproducible
np.random.seed(SEED)

start_date = "2021-01-01"
end_date = datetime.now().date()  # series length still depends on the run date
yearly_growth = 100  # Adjust as needed
initial_value = 500  # Set the initial value of y
grouped_df = create_synthetic_data(start_date, end_date, yearly_growth, initial_value)
print(grouped_df.head())


          ds    y
0 2021-01-01  500
1 2021-01-02  522
2 2021-01-03  526
3 2021-01-04  527
4 2021-01-05  533


In [7]:
# Chronological split: the first 80% of rows train the model, the rest test it
train_size = int(len(grouped_df) * 0.8)
train_df = grouped_df.iloc[:train_size]
test_df = grouped_df.iloc[train_size:]

# Fit a Prophet model with default settings on the training rows
model = Prophet()
model.fit(train_df)

# Predict for exactly the dates present in the test set
future = test_df['ds'].to_frame()
test_pred = model.predict(future)

# Line up predictions and actuals on the date index, then score them
forecast_test = test_pred.set_index('ds').join(test_df.set_index('ds'))
mae = mean_absolute_error(forecast_test['y'], forecast_test['yhat'])
rmse = np.sqrt(
    mean_squared_error(forecast_test['y'], forecast_test['yhat'])
)
print(f'MAE: {mae}, RMSE: {rmse}')

19:31:53 - cmdstanpy - INFO - Chain [1] start processing
19:31:54 - cmdstanpy - INFO - Chain [1] done processing


MAE: 8.396816055261139, RMSE: 10.789644717991553


In [22]:
# Plot the train/test series together with the test-period predictions
fig = go.Figure()

# Train set (legend label fixed: this trace was previously also named 'Test Data')
fig.add_trace(go.Scatter(x=train_df['ds'], y=train_df['y'], name='Train Data', mode='lines', line=dict(width=1.0)))

# Test set
fig.add_trace(go.Scatter(x=test_df['ds'], y=test_df['y'], name='Test Data', mode='lines', line=dict(color='green')))

# Predictions
fig.add_trace(go.Scatter(x=test_pred['ds'], y=test_pred['yhat'], mode='lines',
                         marker=dict(color='red'), name='Predictions'))

# Confidence intervals: the upper bound is drawn first with an invisible line,
# then the lower bound fills up to it ('tonexty' fills to the previous trace).
fig.add_trace(go.Scatter(x=test_pred['ds'], y=test_pred['yhat_upper'], name='Upper Bound', mode='lines', line=dict(width=0, color='red')))
fig.add_trace(go.Scatter(x=test_pred['ds'], y=test_pred['yhat_lower'], name='Lower Bound', mode='lines', line=dict(width=0, color='red'), fill='tonexty'))

fig.update_layout(title='Prophet Predictions with Confidence Intervals', xaxis_title='Date', yaxis_title='Value')
fig.show()

In [15]:
# TODO: redo the whole next bit so that forecasts and historical data are separated
# Forecast window: the last observed date plus the following 7 days
end_date = grouped_df['ds'].max()
pred_window = create_date_range_dataframe(end_date, 0, 7)
# NOTE: `model` was fitted on train_df only (the first 80% of rows), not on the full series.
forecast = model.predict(pred_window)

# Get most recent data (last 21 rows)
latest_data = grouped_df.tail(21)

# Plot the results using Plotly
fig = go.Figure()

# Latest Data (legend label fixed: this trace was previously named 'Test Data')
fig.add_trace(go.Scatter(x=latest_data['ds'], y=latest_data['y'], name='Latest Data', mode='lines+markers'))

# Predictions
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='markers',
                         marker=dict(color='red'), name='Predictions'))

# Confidence intervals: the upper bound is drawn first with an invisible line,
# then the lower bound fills up to it ('tonexty' fills to the previous trace).
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat_upper'], name='Upper Bound', mode='lines', line=dict(width=0, color='red')))
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat_lower'], name='Lower Bound', mode='lines', line=dict(width=0, color='red'), fill='tonexty'))

fig.update_layout(title='Prophet Predictions with Confidence Intervals', xaxis_title='Date', yaxis_title='Value')
fig.show()
