# Corporate Deposits Forecast Model

<a id='toc2_'></a>

## Install the client library

The client library provides Python support for the ValidMind Developer Framework. To install it:

In [1]:
%pip install -q validmind

Note: you may need to restart the kernel to use updated packages.


<a id='toc4_'></a>

## Initialize the Python environment

Next, let's import the necessary libraries and set up your Python environment for data analysis:

In [2]:
%pip install -q pymc prophet

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az

import plotly.express as px
import plotly.graph_objects as go

import validmind as vm

from statsmodels.tsa.seasonal import seasonal_decompose



### Helper functions

In [4]:
import pandas as pd
import numpy as np



In [5]:
def seasonality_model(t, y, fourier_features, alpha_mu=0, alpha_sigma=0.5, beta_mu=0, beta_sigma=0.5, sigma_sigma=0.1, fourier_sigma=10, samples=500, tune=500, seasonality_type='additive'):
    """
    Build and fit a Bayesian linear-trend + Fourier-seasonality model with PyMC.

    The model is ``trend = alpha + beta * t`` combined with
    ``seasonality = fourier_features @ beta_fourier``, either additively
    (``trend + seasonality``) or multiplicatively (``trend * (1 + seasonality)``),
    with a Normal likelihood named ``"likelihood"``.

    Parameters:
    - t: Time index (numpy array)
    - y: Target variable (numpy array)
    - fourier_features: Fourier features for seasonality (pandas DataFrame)
    - alpha_mu: Mean of the prior for alpha (intercept)
    - alpha_sigma: Standard deviation of the prior for alpha (intercept)
    - beta_mu: Mean of the prior for beta (slope)
    - beta_sigma: Standard deviation of the prior for beta (slope)
    - sigma_sigma: Scale of the half-normal prior for sigma (observation noise)
    - fourier_sigma: Standard deviation of the prior for Fourier coefficients
    - samples: Number of posterior draws passed to pm.sample
    - tune: Number of tuning steps passed to pm.sample
    - seasonality_type: Type of seasonality ('additive' or 'multiplicative')

    Returns:
    - model: The pm.Model object
    - trace: Result of pm.sample(..., return_inferencedata=True)
    - prior_predictive: Result of pm.sample_prior_predictive()

    Raises:
    - ValueError: If seasonality_type is neither 'additive' nor 'multiplicative'
    """
    # Fail fast: reject a bad option before any model variables are created
    if seasonality_type not in ('additive', 'multiplicative'):
        raise ValueError("seasonality_type must be either 'additive' or 'multiplicative'")

    # One coordinate per Fourier feature column, used as the dims of beta_fourier
    coords = {"fourier_features": np.arange(fourier_features.shape[1])}

    with pm.Model(check_bounds=False, coords=coords) as model:
        # Priors for the linear trend: intercept and slope
        alpha = pm.Normal("alpha", mu=alpha_mu, sigma=alpha_sigma)
        beta = pm.Normal("beta", mu=beta_mu, sigma=beta_sigma)

        # Prior for the observation noise
        sigma = pm.HalfNormal("sigma", sigma=sigma_sigma)

        # Prior for the Fourier coefficients (one per feature column)
        beta_fourier = pm.Normal("beta_fourier", mu=0, sigma=fourier_sigma, dims="fourier_features")

        # Registered as Deterministic so they can be extracted by name later
        seasonality = pm.Deterministic("seasonality", pm.math.dot(fourier_features.to_numpy(), beta_fourier))
        trend = pm.Deterministic("trend", alpha + beta * t)

        # Expected value of the observed variable
        if seasonality_type == 'multiplicative':
            mu = trend * (1 + seasonality)
        else:
            mu = trend + seasonality

        # Likelihood of the observed data
        pm.Normal("likelihood", mu=mu, sigma=sigma, observed=y)

        # Sample from the prior predictive distribution, then from the posterior
        prior_predictive = pm.sample_prior_predictive()
        trace = pm.sample(samples, tune=tune, return_inferencedata=True)

    return model, trace, prior_predictive


In [6]:
def plot_prior_predictive_seasonality(df, prior_predictive, y_max, target_column, start_date=None, end_date=None, period=12):
    """
    Plot the prior predictive distribution, prior trend lines, and prior seasonality.

    Three Plotly figures are shown in turn. Sampled curves are multiplied by
    ``y_max`` before plotting; seasonality is multiplied by 100 and labelled
    as a percent change.

    Parameters:
    df (pd.DataFrame): Original data. Its index is used as the x-axis for the sampled
        curves, and after ``reset_index()`` it must expose a 'DATE' column.
    prior_predictive: Object accepted by ``az.extract`` with a "prior_predictive" group
        containing "likelihood" and a "prior" group containing "trend" and "seasonality".
    y_max (float): Scaling factor applied to sampled curves.
    target_column (str): Name of the target variable column.
    start_date (str): Optional start of the x-axis range (used only with end_date).
    end_date (str): Optional end of the x-axis range (used only with start_date).
    period (int): Number of leading points shown in the seasonality figure.
        Defaults to 12, the previously hard-coded value.
    """
    # Reset the index to access the dates for plotting
    df_reset = df.reset_index()

    # Extract prior predictive samples
    prior_predictive_samples = az.extract(prior_predictive, group="prior_predictive", num_samples=100)["likelihood"]

    # Extract prior trend lines
    prior_trend_lines = az.extract(prior_predictive, group="prior", num_samples=100)["trend"]

    # Extract prior seasonality
    prior_seasonality = az.extract(prior_predictive, group="prior", num_samples=100)["seasonality"]

    # Figure 1: prior predictive draws with the observed data on top
    fig = go.Figure()

    for sample in prior_predictive_samples.T:
        fig.add_trace(go.Scatter(x=df.index, y=sample * y_max, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

    fig.add_trace(go.Scatter(x=df_reset['DATE'], y=df_reset[target_column], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))

    fig.update_layout(title="Prior Predictive", xaxis_title="Date", yaxis_title=target_column)

    if start_date and end_date:
        fig.update_xaxes(range=[start_date, end_date])

    fig.show()

    # Figure 2: prior trend lines with the observed data on top
    fig_trend = go.Figure()

    for trend_line in prior_trend_lines.T:
        fig_trend.add_trace(go.Scatter(x=df.index, y=trend_line * y_max, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

    fig_trend.add_trace(go.Scatter(x=df_reset['DATE'], y=df_reset[target_column], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))

    fig_trend.update_layout(title="Prior Trend Lines", xaxis_title="Date", yaxis_title=target_column)

    if start_date and end_date:
        fig_trend.update_xaxes(range=[start_date, end_date])

    fig_trend.show()

    # Figure 3: prior seasonality over the first `period` points
    fig_seasonality = go.Figure()

    for season in prior_seasonality.T:
        fig_seasonality.add_trace(go.Scatter(x=df.index[:period], y=season[:period] * 100, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

    fig_seasonality.update_layout(title="Prior Seasonality", xaxis_title="Date", yaxis_title="Percent change")

    fig_seasonality.show()

In [7]:
import plotly.graph_objects as go

import pymc as pm
import arviz as az
import plotly.graph_objects as go

def plot_posterior_predictive_seasonality(df, model, trace, y_max, target_column, period=12, start_date=None, end_date=None):
    """
    Show three Plotly figures for a fitted model: posterior predictive draws,
    posterior trend lines, and posterior seasonality.

    Parameters:
    df (pd.DataFrame): Original data; its index is the x-axis for sampled curves and,
        after ``reset_index()``, it must expose a 'DATE' column.
    model (pm.Model): Model passed to ``pm.sample_posterior_predictive``.
    trace (arviz.InferenceData): Posterior samples containing "trend" and "seasonality".
    y_max (float): Factor that sampled curves are multiplied by before plotting.
    target_column (str): Name of the target variable column.
    period (int): Number of leading points shown in the seasonality figure.
    start_date (str): Optional start of the x-axis range (used only with end_date).
    end_date (str): Optional end of the x-axis range (used only with start_date).
    """
    dated = df.reset_index()

    # Draw from the posterior predictive, then pull 100 draws of each quantity
    ppc = pm.sample_posterior_predictive(trace, model=model)
    predictive_draws = az.extract(ppc, group="posterior_predictive", num_samples=100)["likelihood"]
    trend_draws = az.extract(trace, group="posterior", num_samples=100)["trend"]
    seasonal_draws = az.extract(trace, group="posterior", num_samples=100)["seasonality"]

    # The predictive and trend figures share the same layout: faint sampled
    # curves rescaled by y_max, with the observed series drawn on top.
    overlays = (
        ("Posterior Predictive", predictive_draws),
        ("Posterior Trend Lines", trend_draws),
    )
    for figure_title, draws in overlays:
        figure = go.Figure()
        for curve in draws.T:
            figure.add_trace(go.Scatter(x=df.index, y=curve * y_max, mode='lines', line=dict(color='blue', width=1), opacity=0.01))
        figure.add_trace(go.Scatter(x=dated['DATE'], y=dated[target_column], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))
        figure.update_layout(title=figure_title, xaxis_title="Date", yaxis_title=target_column)
        # The x-axis is only clipped when both ends of the window are given
        if start_date and end_date:
            figure.update_xaxes(range=[start_date, end_date])
        figure.show()

    # Seasonality: first `period` points of each draw, shown as a percentage
    seasonal_figure = go.Figure()
    window_dates = df.index[:period]
    for cycle in seasonal_draws.T:
        seasonal_figure.add_trace(go.Scatter(x=window_dates, y=cycle[:period] * 100, mode='lines', line=dict(color='blue', width=1), opacity=0.05))
    seasonal_figure.update_layout(title="Posterior Seasonality", xaxis_title="Date", yaxis_title="Percent change")
    seasonal_figure.show()


In [8]:
def normalize_time_index(df, index_column='DATE'):
    """
    Normalize the time index to be between 0 and 1 based on the number of days since 1900-01-01.

    Side effect: if `index_column` is a column of `df`, it is converted to datetime
    and set as the index of `df` IN PLACE (the caller's frame is modified).

    Parameters:
    df (pd.DataFrame): DataFrame with a datetime index or a column with datetime values.
    index_column (str): Name of the column to use as the time index if the DataFrame doesn't have a datetime index.

    Returns:
    np.ndarray: Normalized time index. An empty frame yields an empty array; when all
        timestamps fall on the same day (e.g. a single row) every value is 0.0.
    """
    if index_column in df.columns:
        df[index_column] = pd.to_datetime(df[index_column])
        df.set_index(index_column, inplace=True)

    # Convert the index to the number of days since 1900-01-01
    t = (df.index - pd.Timestamp("1900-01-01")).days.to_numpy()

    # Nothing to normalize; np.min/np.max would raise on an empty array
    if t.size == 0:
        return np.empty(0, dtype=float)

    t_min = np.min(t)
    span = np.max(t) - t_min

    # A zero span would give 0/0 = NaN; map the degenerate case to 0.0 instead
    if span == 0:
        return np.zeros(t.shape, dtype=float)

    # Normalize t to be between 0 and 1
    return (t - t_min) / span

In [9]:
def generate_fourier_features(t_normalized, n_order):
    """
    Generate Fourier features for seasonality modeling.

    For each order k in 1..n_order, two columns are produced, in this order:
    ``sin_order_k = sin(2*pi*k*t)`` and ``cos_order_k = cos(2*pi*k*t)``.

    Parameters:
    t_normalized (pd.Series, np.ndarray or sequence): Normalized time variable.
        A Series keeps its index; any other input gets a default integer index.
    n_order (int): The number of orders to generate for the Fourier series.

    Returns:
    pd.DataFrame: DataFrame containing the Fourier features (2 * n_order columns).
    """
    # Wrap anything that is not already a Series so lists/tuples work as well as arrays
    if not isinstance(t_normalized, pd.Series):
        t_normalized = pd.Series(np.asarray(t_normalized))

    t_numeric = t_normalized.to_numpy()

    columns = {}
    for order in range(1, n_order + 1):
        # The phase is shared by the sine and cosine term of the same order
        phase = 2 * np.pi * t_numeric * order
        columns[f"sin_order_{order}"] = np.sin(phase)
        columns[f"cos_order_{order}"] = np.cos(phase)

    # Reuse the input's index so features stay aligned with the time variable
    return pd.DataFrame(columns, index=t_normalized.index)

In [10]:
def normalize_target_variable(df, target_column):
    """
    Scale the target variable by its maximum value.

    The result lies in [0, 1] only when the target is non-negative with a
    positive maximum; no check is made for a zero maximum.

    Parameters:
    df (pd.DataFrame): DataFrame containing the target variable.
    target_column (str): Name of the target variable column.

    Returns:
    np.ndarray: Target values divided by their maximum.
    float: The maximum itself (multiply by it to undo the scaling).
    """
    values = df[target_column].to_numpy()
    peak = np.max(values)
    return values / peak, peak

<a id='toc5_'></a>

## Load the sample dataset

The sample dataset used here is provided by the ValidMind library. To be able to use it, you'll need to import the dataset and load it into a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), a two-dimensional tabular data structure that makes use of rows and columns:

In [11]:
# Load the FRED deposits demo dataset shipped with the ValidMind library.
from validmind.datasets.regression import fred_deposits as demo_dataset

# load_data() is unpacked into five frames: the deposits series plus four others
# (presumably Fed funds, 3-month T-bill, 10y and 30y Treasury rates — TODO confirm).
deposits_df, fedfunds_df, tb3ms_df, gs10_df, gs30_df = demo_dataset.load_data()

# Dataset-level settings reused by later cells.
target_column = demo_dataset.target_column
period = demo_dataset.period


In [12]:
# Plot the raw deposits series. The column is taken from `target_column`
# (set when the dataset was loaded) rather than repeating the series id as a literal.
fig = px.line(deposits_df, x=deposits_df.index, y=target_column, title='Original Data')
fig.update_layout(xaxis_title='Date', yaxis_title='Value')
fig.show()

In [13]:
from prophet import Prophet

# Prepare the data for Prophet: a date column named 'ds' and a value column named 'y'.
# NOTE(review): every step below mutates `deposits_df` in place. After this cell the
# frame no longer has its date index or its original target column name, so cells
# that reference 'DPSACBW027NBOG' or use `deposits_df.index` as dates must run first.
deposits_df.reset_index(inplace=True)  # Move the index to a column
deposits_df.rename(columns={'DATE': 'ds', target_column: 'y'}, inplace=True)  # Rename columns
deposits_df['ds'] = pd.to_datetime(deposits_df['ds'])  # Ensure 'ds' is datetime

In [14]:
# Display the Prophet-formatted frame (columns 'ds' and 'y').
deposits_df

Unnamed: 0,ds,y
0,2010-01-01,7692.715500
1,2010-02-01,7937.060000
2,2010-03-01,8145.245102
3,2010-04-01,8211.242000
4,2010-05-01,8089.598702
...,...,...
163,2023-08-01,17042.309400
164,2023-09-01,16855.737048
165,2023-10-01,16765.614500
166,2023-11-01,16931.713698


In [15]:
# Initialize and fit the Prophet model with yearly seasonality enabled
prophet_model = Prophet(yearly_seasonality=True)
prophet_model.fit(deposits_df)

# Extend the history by 12 months and forecast over history + future.
# The observations are stamped on the first day of each month, so the future
# dates use month-start frequency ('MS') to stay on the same grid; 'M' would
# produce month-end dates.
future = prophet_model.make_future_dataframe(periods=12, freq='MS')
forecast = prophet_model.predict(future)

# Keep only the rows that correspond to the observed history
aligned_forecast = forecast.iloc[:len(deposits_df)]
trend = aligned_forecast['trend'].values
# Only the yearly component is used as the seasonality signal
seasonality = aligned_forecast['yearly'].values

11:57:46 - cmdstanpy - INFO - Chain [1] start processing
11:57:46 - cmdstanpy - INFO - Chain [1] done processing


In [17]:
# Peek at the first five values of the Prophet trend and yearly seasonality arrays.
trend[:5], seasonality[:5]

(array([7395.53589437, 7443.55720775, 7486.93129725, 7534.95261063,
        7581.42484938]),
 array([ 65.68894016, 258.14078919, 525.79765312, 608.28325307,
        466.15526279]))

In [18]:
import plotly.graph_objects as go

# Actual values, indexed by date
actual = deposits_df.set_index('ds')['y']

# In-sample Prophet predictions (forecast rows truncated to the observed history)
predicted = aligned_forecast.set_index('ds')['yhat']

# Create the plot for Forecast vs. Observed
fig_forecast = go.Figure()

# Add actual values to the plot
fig_forecast.add_trace(go.Scatter(x=actual.index, y=actual, mode='lines', name='Actual'))

# Add predicted values to the plot
fig_forecast.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Prophet Predicted'))

fig_forecast.update_layout(
    title='Forecast vs Observed Values',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Legend'
)

fig_forecast.show()

In [20]:
# Create the plot for Trend Component
fig_trend = go.Figure()

# Add trend component to the plot.
# `trend` must still be the NumPy array taken from the Prophet forecast here.
fig_trend.add_trace(go.Scatter(x=deposits_df["ds"], y=trend, mode='lines+markers', name='Trend'))

fig_trend.update_layout(
    title='Trend Component',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Component'
)

fig_trend.show()

In [21]:
# Create the plot for Seasonality Component
fig_seasonality = go.Figure()

# Add seasonality component to the plot.
# `seasonality` must still be the NumPy array taken from the Prophet forecast here.
fig_seasonality.add_trace(go.Scatter(x=deposits_df["ds"], y=seasonality, mode='lines+markers', name='Seasonality'))

fig_seasonality.update_layout(
    title='Seasonality Component',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Component'
)

fig_seasonality.show()

In [27]:
# Display the frame again to confirm it still has the 'ds' / 'y' layout.
deposits_df

Unnamed: 0,ds,y
0,2010-01-01,7692.715500
1,2010-02-01,7937.060000
2,2010-03-01,8145.245102
3,2010-04-01,8211.242000
4,2010-05-01,8089.598702
...,...,...
163,2023-08-01,17042.309400
164,2023-09-01,16855.737048
165,2023-10-01,16765.614500
166,2023-11-01,16931.713698


In [28]:
import pymc as pm
import arviz as az
import plotly.express as px
import plotly.graph_objects as go

# Scale the target by its maximum; y_max is kept so values can be mapped back to raw units.
y_normalized, y_max = normalize_target_variable(deposits_df, "y")

In [22]:


with pm.Model() as model:
    # Prior for the observation noise
    sigma = pm.HalfNormal('sigma', sigma=0.1)

    # Register the Prophet components (NumPy arrays) as PyMC data
    trend_shared = pm.Data("trend_shared", trend)
    seasonality_shared = pm.Data("seasonality_shared", seasonality)

    # Introduce uncertainty around the Prophet trend and seasonality.
    # The random variables get their own Python names so that the NumPy arrays
    # `trend` and `seasonality` are NOT overwritten: later cells (and a re-run
    # of this one) still need the arrays. The PyMC variable names stay
    # 'trend' and 'seasonality', so extraction by name is unaffected.
    trend_rv = pm.Normal('trend', mu=trend_shared, sigma=10)
    seasonality_rv = pm.Normal('seasonality', mu=seasonality_shared, sigma=10)

    # Define the mean of the observations
    mu = trend_rv + seasonality_rv

    # Likelihood (sampling distribution) of observations.
    # NOTE(review): the Prophet components are in raw units (thousands, per the
    # values printed above) while `y_normalized` is y / y_max — this looks like a
    # scale mismatch between `mu` and the observed data; confirm before sampling.
    pm.Normal('likelihood', mu=mu, sigma=sigma, observed=y_normalized)

    # Sample prior predictive
    prior = pm.sample_prior_predictive()

Sampling: [likelihood, seasonality, sigma, trend]


In [23]:
  # Extract prior predictive samples
# Pull 100 draws of the simulated observations from the prior predictive group.
prior_predictive_samples = az.extract(prior, group="prior_predictive", num_samples=100)["likelihood"]

# Extract prior trend lines
# NOTE(review): this and the next call subsample the prior group separately, so the
# trend and seasonality draws are presumably not from the same sample indices — confirm if pairing matters.
prior_trend_lines = az.extract(prior, group="prior", num_samples=100)["trend"]

# Extract prior seasonality
prior_seasonality = az.extract(prior, group="prior", num_samples=100)["seasonality"]

In [26]:
# Overlay the sampled prior trend lines on the observed series.
# NOTE(review): the figure is titled "Prior Predictive" but the curves drawn are
# `prior_trend_lines`, not `prior_predictive_samples` — confirm which was intended.
fig = go.Figure()

# One faint line per sampled trend draw
for sample in prior_trend_lines.T:
    fig.add_trace(go.Scatter(x=deposits_df['ds'], y=sample, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

# Observed data on top
fig.add_trace(go.Scatter(x=deposits_df['ds'], y=deposits_df["y"], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))

fig.update_layout(title="Prior Predictive", xaxis_title="Date", yaxis_title=target_column)

fig.show()

In [None]:
import arviz as az
import pandas as pd
import plotly.graph_objects as go

# Posterior predictive mean, one value per observation date.
# az.extract stacks (chain, draw) into a single "sample" dimension; averaging over
# that dimension by name uses every draw. Averaging over axis 0 instead would
# average over the observations, so the result matched the data length only by
# requesting exactly len(data) draws.
posterior_predictive_samples = az.extract(posterior_predictive, group="posterior_predictive")["y_obs"]
posterior_mean = posterior_predictive_samples.mean(dim="sample").values

# One posterior mean per observed date
posterior_mean_series = pd.Series(posterior_mean, index=deposits_df['ds'])

# Actual values
actual = deposits_df.set_index('ds')['y']

# Predicted values from Prophet
predicted = aligned_forecast.set_index('ds')['yhat']

# Create the plot
fig = go.Figure()

# Add actual values to the plot
fig.add_trace(go.Scatter(x=actual.index, y=actual, mode='lines', name='Actual'))

# Add predicted values to the plot
fig.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Prophet Predicted'))

# Add the posterior predictive mean values to the plot
fig.add_trace(go.Scatter(
    x=posterior_mean_series.index,
    y=posterior_mean_series,
    mode='lines',
    name='Posterior Predictive Mean',
    line=dict(dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Legend'
)

fig.show()


In [None]:
import pymc as pm
import numpy as np

# Observed target values in raw units (not normalized)
y_observed = deposits_df['y'].values

# Define the PyMC model
with pm.Model() as model:
    # Noise scale; the same `sigma` is used below both as the spread of `mu`
    # around the Prophet fit and as the observation noise.
    sigma = pm.HalfCauchy('sigma', beta=0.5, initval=1.0)
    
    # Per-observation latent mean centered on Prophet trend + seasonality.
    # `trend` and `seasonality` must be the NumPy arrays from the Prophet forecast here.
    mu = pm.Normal('mu', mu=trend + seasonality, sigma=sigma, shape=len(trend + seasonality))
    
    # Likelihood (sampling distribution) of observations
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y_observed)
    
    # Inference: 1000 draws after 1000 tuning steps
    trace = pm.sample(1000, tune=1000, return_inferencedata=True)

# Posterior predictions for the observed variable, reusing the model context
with model:
    posterior_predictive = pm.sample_posterior_predictive(trace)



In [None]:
import arviz as az
import plotly.graph_objects as go

# Posterior predictive mean, one value per observation date.
# az.extract stacks (chain, draw) into a single "sample" dimension; averaging over
# that dimension by name uses every draw. Averaging over axis 0 instead would
# average over the observations, so the result matched the data length only by
# requesting exactly len(data) draws.
posterior_predictive_samples = az.extract(posterior_predictive, group="posterior_predictive")["y_obs"]
posterior_mean = posterior_predictive_samples.mean(dim="sample").values

# One posterior mean per observed date
posterior_mean_series = pd.Series(posterior_mean, index=deposits_df['ds'])

# Actual values
actual = deposits_df.set_index('ds')['y']

# Predicted values from Prophet
predicted = aligned_forecast.set_index('ds')['yhat']

# Create the plot
fig = go.Figure()

# Add actual values to the plot
fig.add_trace(go.Scatter(x=actual.index, y=actual, mode='lines', name='Actual'))

# Add predicted values to the plot
fig.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Prophet Predicted'))

# Add the posterior predictive mean values to the plot
fig.add_trace(go.Scatter(
    x=posterior_mean_series.index,
    y=posterior_mean_series,
    mode='lines',
    name='Posterior Predictive Mean',
    line=dict(dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Legend'
)

fig.show()


In [None]:
import arviz as az
import pandas as pd
import plotly.graph_objects as go

# Posterior predictive mean, one value per observation date.
# az.extract stacks (chain, draw) into a single "sample" dimension; averaging over
# that dimension by name yields len(observations) values. Averaging 100 draws over
# axis 0 would yield 100 values, which cannot be indexed by the observed dates.
posterior_predictive_samples = az.extract(posterior_predictive, group="posterior_predictive")["y_obs"]
posterior_mean = posterior_predictive_samples.mean(dim="sample").values

# One posterior mean per date in the history-aligned Prophet forecast
posterior_mean_series = pd.Series(posterior_mean, index=aligned_forecast['ds'])

# Actual values
actual = deposits_df.set_index('ds')['y']

# Predicted values from Prophet
predicted = aligned_forecast.set_index('ds')['yhat']

# Create the plot
fig = go.Figure()

# Add actual values to the plot
fig.add_trace(go.Scatter(x=actual.index, y=actual, mode='lines', name='Actual'))

# Add predicted values to the plot
fig.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Prophet Predicted'))

# Add the posterior predictive mean values to the plot
fig.add_trace(go.Scatter(
    x=posterior_mean_series.index,
    y=posterior_mean_series,
    mode='lines',
    name='Posterior Predictive Mean',
    line=dict(dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Legend'
)

fig.show()


In [None]:
# Request as many posterior predictive draws as there are forecast dates.
# NOTE(review): the draw count is tied to the forecast length only so that the mean
# below has that length. az.extract appears to return (observation, sample), in which
# case axis=0 averages over observations rather than draws — confirm; the intended
# quantity is probably the mean over the "sample" dimension.
num_samples = len(forecast['ds'])
posterior_predictive_samples = az.extract(posterior_predictive, group="posterior_predictive", num_samples=num_samples)["y_obs"]

# Calculate the mean of the posterior predictive samples
posterior_mean = np.mean(posterior_predictive_samples, axis=0)

In [None]:
# Index the posterior mean by the forecast dates (history plus future rows).
# NOTE(review): the models in this notebook are fit on the observed history only,
# so there are presumably no posterior predictive values for future dates — confirm.
posterior_mean_series = pd.Series(posterior_mean, index=forecast['ds'])

In [None]:
# Actual values, indexed by date
actual = deposits_df.set_index('ds')['y']

# Prophet predictions over the full forecast frame (history plus future rows)
predicted = forecast.set_index('ds')['yhat']

# Create the plot
fig = go.Figure()

# Add actual values to the plot
fig.add_trace(go.Scatter(x=actual.index, y=actual, mode='lines', name='Actual'))

# Add predicted values to the plot
fig.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Prophet Predicted'))

# Add the posterior predictive mean values to the plot (dashed line)
fig.add_trace(go.Scatter(
    x=posterior_mean_series.index,
    y=posterior_mean_series,
    mode='lines',
    name='Posterior Predictive Mean',
    line=dict(dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Legend'
)

fig.show()


In [None]:
import plotly.graph_objects as go

# Actual values
actual = deposits_df.set_index('ds')['y']

# Predicted values
predicted = forecast.set_index('ds')['yhat']

# Posterior predictive mean per observation. `posterior_predictive` is an
# InferenceData object, so the draws live in its "posterior_predictive" group
# and are averaged over the sampling dimensions (chain, draw).
posterior_mean = posterior_predictive["posterior_predictive"]["y_obs"].mean(dim=("chain", "draw")).values

# Create the plot
fig = go.Figure()

# Add actual values to the plot
fig.add_trace(go.Scatter(x=actual.index, y=actual, mode='lines', name='Actual'))

# Add predicted values to the plot
fig.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Predicted'))

# Add the posterior predictive mean. It has one value per observed date, so it is
# plotted against the observed dates rather than the longer forecast dates.
fig.add_trace(go.Scatter(
    x=actual.index,
    y=posterior_mean,
    mode='lines',
    name='Posterior Mean',
    line=dict(dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Date',
    yaxis_title='Value',
    legend_title='Legend'
)

fig.show()


In [None]:
# Raw posterior predictive draws as a NumPy array. The variable lives in the
# "posterior_predictive" group of the InferenceData object, not at its top level.
posterior_samples = posterior_predictive["posterior_predictive"]["y_obs"].values

In [None]:
import pymc as pm
import numpy as np
import arviz as az

# Prepare the data for PyMC: raw target values and an integer time step per row.
# NOTE(review): this reads the original column name, so it only works before
# `deposits_df` has been renamed to the Prophet 'ds' / 'y' layout.
y = deposits_df['DPSACBW027NBOG'].values
x = np.arange(len(y))

# Define the model
with pm.Model() as model:
    # Priors for unknown model parameters
    # NOTE(review): these priors are narrow relative to raw values in the thousands
    # (see the table printed earlier) — confirm whether `y` should be scaled first.
    alpha = pm.Normal('alpha', mu=0, sigma=10)
    beta = pm.Normal('beta', mu=0, sigma=10)
    sigma = pm.HalfNormal('sigma', sigma=1)
    
    # Seasonality component: one offset per position in a 12-step cycle
    seasonality = pm.Normal('seasonality', mu=0, sigma=10, shape=12)
    
    # Expected value of outcome: linear trend plus the offset for step x modulo 12
    mu = alpha + beta * x + seasonality[x % 12]
    
    # Likelihood (sampling distribution) of observations
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)
    
    # Posterior distribution
    trace = pm.sample(1000, return_inferencedata=True)

# Generate posterior predictive samples
with model:
    posterior_predictive = pm.sample_posterior_predictive(trace, var_names=["y_obs"])

In [None]:
# Display the y_obs variable stored in the "posterior_predictive" group
posterior_predictive["posterior_predictive"]["y_obs"]

In [None]:
# Stack all posterior predictive draws into a 2-D array: one row per draw, one
# column per observation. The "sample" dimension is moved first by name so the
# axis=0 reductions below run over draws.
y_obs_draws = az.extract(posterior_predictive, group="posterior_predictive")["y_obs"].transpose("sample", ...).values

# Summaries across draws: mean and a central 95% band
posterior_predictive_mean = y_obs_draws.mean(axis=0)
lower_bound, upper_bound = np.percentile(y_obs_draws, [2.5, 97.5], axis=0)

# Create a DataFrame for plotting
plot_df = pd.DataFrame({
    'Date': deposits_df.index,
    'Actual': y,
    'Predicted': posterior_predictive_mean,
    'Lower Bound': lower_bound,
    'Upper Bound': upper_bound
})

# Plot the actual data and the posterior predictive mean using Plotly
fig = px.line(plot_df, x='Date', y='Actual', title='Posterior Predictive vs Actual Data')
fig.add_scatter(x=plot_df['Date'], y=plot_df['Predicted'], mode='lines', name='Posterior Predictive Mean')
fig.add_scatter(x=plot_df['Date'], y=plot_df['Lower Bound'], mode='lines', name='Lower Bound', line=dict(dash='dash'))
fig.add_scatter(x=plot_df['Date'], y=plot_df['Upper Bound'], mode='lines', name='Upper Bound', line=dict(dash='dash'))
fig.update_layout(xaxis_title='Date', yaxis_title='Value')
fig.show()

In [None]:
from prophet import Prophet

# Prepare the data for Prophet on a copy: reset_index()/rename() return new frames,
# so `deposits_df` itself is left untouched by this cell.
data = deposits_df.reset_index().rename(columns={'DATE': 'ds', 'DPSACBW027NBOG': 'y'})

# Initialize the Prophet model with default settings.
# NOTE: this rebinds the global name `model`, which other cells use for PyMC models.
model = Prophet()

# Fit the model to the data
model.fit(data)

# Make future predictions
future = model.make_future_dataframe(periods=12, freq='MS')  # Forecasting 12 months into the future
forecast = model.predict(future)

In [None]:
# Create the Plotly figure
fig = go.Figure()

# Plot the historical data
fig.add_trace(go.Scatter(x=data['ds'], y=data['y'], mode='markers', name='Observed Data'))

# Plot the forecasted data
fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines', name='Forecast'))

# Shade the band between yhat_lower and yhat_upper.
# The upper bound is drawn first as an invisible line (width 0, hidden from the legend) ...
fig.add_trace(go.Scatter(
    x=forecast['ds'],
    y=forecast['yhat_upper'],
    mode='lines',
    line=dict(width=0),
    showlegend=False
))
# ... and the lower bound fills up to the previous trace via fill='tonexty'.
fig.add_trace(go.Scatter(
    x=forecast['ds'],
    y=forecast['yhat_lower'],
    mode='lines',
    line=dict(width=0),
    fill='tonexty',
    fillcolor='rgba(0, 100, 80, 0.2)',
    name='Confidence Interval'
))

# Update layout
fig.update_layout(
    title='Deposits Forecast',
    xaxis_title='Date',
    yaxis_title='Deposits',
    legend_title='Legend',
    showlegend=True
)

fig.show()

In [None]:
import numpy as np
import pandas as pd
import pymc as pm
import plotly.graph_objs as go
import arviz as az

# Prepare the data for PyMC: raw target values and an integer time step per row.
# NOTE(review): this reads the original column name, so it only works before
# `deposits_df` has been renamed to the Prophet 'ds' / 'y' layout.
y = deposits_df['DPSACBW027NBOG'].values
x = np.arange(len(y))

# Define the model: a plain linear trend with Gaussian noise
with pm.Model() as model:
    # Priors for unknown model parameters
    alpha = pm.Normal('alpha', mu=0, sigma=10)
    beta = pm.Normal('beta', mu=0, sigma=10)
    sigma = pm.HalfNormal('sigma', sigma=1)
    
    # Expected value of outcome
    mu = alpha + beta * x
    
    # Likelihood (sampling distribution) of observations
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)
    
    # Posterior distribution
    trace = pm.sample(1000, return_inferencedata=True)


In [None]:
# Pull 100 posterior draws for plotting.
# NOTE(review): `num_samples` subsamples the posterior, presumably at random, so the
# x-axis below may not follow sampling order as a trace plot normally does — confirm.
trace_data = az.extract(trace, group='posterior', num_samples=100)

# One figure per parameter
fig_trace_alpha = go.Figure()
fig_trace_beta = go.Figure()
fig_trace_sigma = go.Figure()

# Plot the trace for alpha
fig_trace_alpha.add_trace(go.Scatter(y=trace_data['alpha'], mode='lines', name='Trace for alpha'))
fig_trace_alpha.update_layout(title="Trace Plot for Alpha", xaxis_title="Sample", yaxis_title="Alpha")

# Plot the trace for beta
fig_trace_beta.add_trace(go.Scatter(y=trace_data['beta'], mode='lines', name='Trace for beta'))
fig_trace_beta.update_layout(title="Trace Plot for Beta", xaxis_title="Sample", yaxis_title="Beta")

# Plot the trace for sigma
fig_trace_sigma.add_trace(go.Scatter(y=trace_data['sigma'], mode='lines', name='Trace for sigma'))
fig_trace_sigma.update_layout(title="Trace Plot for Sigma", xaxis_title="Sample", yaxis_title="Sigma")

# Show the figures
fig_trace_alpha.show()
fig_trace_beta.show()
fig_trace_sigma.show()

## Explore the data

In [None]:
# Wrap each raw frame as a ValidMind dataset so it can be passed to ValidMind tests.
# Each call uses a distinct `input_id`; `__log=False` presumably disables logging
# of the dataset to the platform — TODO confirm against the ValidMind docs.
vm_raw_deposits_ds = vm.init_dataset(
    input_id="raw_deposits_ds",
    dataset=deposits_df,
    __log=False,
)

vm_raw_fedfunds_ds = vm.init_dataset(
    input_id="raw_fedfunds_ds",
    dataset=fedfunds_df,
    __log=False,
)

vm_raw_tb3ms_ds = vm.init_dataset(
    input_id="raw_tb3ms_ds",
    dataset=tb3ms_df,
    __log=False,
)

vm_raw_gs10_ds = vm.init_dataset(
    input_id="raw_gs10_ds",
    dataset=gs10_df,
    __log=False,
)

vm_raw_gs30_ds = vm.init_dataset(
    input_id="raw_gs30_ds",
    dataset=gs30_df,
    __log=False,
)


In [None]:
# Run the TimeSeriesDatasetAnalysis test on all five raw datasets in one call.
vm.tests.run_test(
    test_id="validmind.data_validation.TimeSeriesDatasetAnalysis",
    inputs={
        "datasets": [vm_raw_deposits_ds, vm_raw_fedfunds_ds, vm_raw_tb3ms_ds, vm_raw_gs10_ds, vm_raw_gs30_ds],
    }
)

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.seasonal import seasonal_decompose

def plot_seasonality_decomposition(df, target_column, period=10):
    """
    Show the additive seasonal decomposition of one column as four Plotly figures.

    The figures are displayed in this order: trend, seasonal, residual, and
    finally the original series (drawn as a thicker black line).

    Parameters:
    df (pd.DataFrame): Time series data; its index must have a datetime dtype.
    target_column (str): Column of `df` to decompose.
    period (int): Observations per seasonal cycle, passed to `seasonal_decompose`.

    Raises:
    ValueError: If the index of `df` is not a datetime index.
    """
    # Reject anything that is not indexed by dates before doing any work
    if not pd.api.types.is_datetime64_any_dtype(df.index):
        raise ValueError("DataFrame index must be a datetime index")

    decomposition = seasonal_decompose(df[target_column], model='additive', period=period)

    # (figure title, series to draw, trace name, extra Scatter keyword arguments)
    panels = (
        ("Trend Component", decomposition.trend, 'Trend', {}),
        ("Seasonal Component", decomposition.seasonal, 'Seasonal', {}),
        ("Residual Component", decomposition.resid, 'Residual', {}),
        ("Original Data", df[target_column], 'Original', {"line": dict(color='black', width=2)}),
    )
    for panel_title, series, trace_name, extra_style in panels:
        figure = go.Figure()
        figure.add_trace(go.Scatter(x=df.index, y=series, mode='lines', name=trace_name, **extra_style))
        figure.update_layout(title=panel_title, xaxis_title="Date", yaxis_title="Value")
        figure.show()

# Decompose the deposits series using the notebook-level `target_column` and `period` settings
plot_seasonality_decomposition(df=deposits_df, target_column=target_column, period=period)


## Seasonality modelling

### Scale the data

First, we’ll scale time to be between 0 and 1:

In [None]:
#t_normalized = normalize_time_index(df=deposits_df)

Next, for the target variable, we divide by the maximum. We do this, rather than standardising, so that the sign of the observations is unchanged - this will be necessary for the seasonality component to work properly later on.

In [None]:
#y_normalized, y_max = normalize_target_variable(df=deposits_df, target_column=target_column)

### Seasonality modelling with Prophet

**Fourier Features for Seasonality Modeling**

Fourier features are used to capture and model periodic patterns in time series data. By decomposing the time series into sine and cosine terms at different frequencies, Fourier features enable the model to detect and represent seasonal variations effectively.

Each order of the Fourier series corresponds to a different frequency component, with higher orders capturing more oscillations within a given period. For instance, the first-order terms represent the basic seasonal cycle, while higher-order terms capture more complex, shorter-term fluctuations. This approach allows the model to flexibly fit and predict periodic behavior, making it particularly useful for time series with repeating seasonal patterns.


**Seasonality Model Equation**

The seasonality model can be represented by the following equation:

$$ \text{Deposits} \sim \alpha + \beta \cdot \text{time} + \text{seasonality} $$

In this equation, the trend component ($\alpha + \beta \cdot \text{time}$) captures the overall increase or decrease in deposits over time, while the seasonality component (modeled by the Fourier features) adjusts this trend to account for recurring seasonal effects. The combination of these components enables the model to represent both the long-term trend and the periodic fluctuations in the deposits data.

In [None]:
# Generate Fourier features for seasonality
#n_order = period
#fourier_features = generate_fourier_features(t_normalized=t_normalized, n_order=period)
#fourier_features.head()

In [None]:
# Parse the index as datetimes so the date arithmetic below works
deposits_df.index = pd.to_datetime(deposits_df.index)

# Express every observation date as fractional years elapsed since 1900-01-01
periods = (deposits_df.index - pd.Timestamp("1900-01-01")).days / 365.25

# Build one sine and one cosine column per order, in the column order
# sin_order_1, cos_order_1, sin_order_2, cos_order_2, ...
n_order = 10
fourier_features = pd.DataFrame(
    {
        f"{wave_name}_order_{harmonic}": wave(2 * np.pi * periods * harmonic)
        for harmonic in range(1, n_order + 1)
        for wave_name, wave in (("sin", np.sin), ("cos", np.cos))
    }
)
fourier_features.head()

In [None]:
# Days elapsed since 1900-01-01, then min-max scaled and converted to a NumPy array
t = (deposits_df.index - pd.Timestamp("1900-01-01")).days
t_min, t_max = np.min(t), np.max(t)
t = ((t - t_min) / (t_max - t_min)).to_numpy()

In [None]:
# Divide the target by its maximum; `y_max` is kept to rescale model output for plotting
y = deposits_df[target_column].to_numpy()
y_max = y.max()
y = y / y_max

In [None]:
def seasonality_model(t, y, fourier_features, alpha_mu=0, alpha_sigma=0.5, beta_mu=0, beta_sigma=0.5, sigma_sigma=0.1, fourier_sigma=10, samples=500, tune=500, seasonality_type='additive'):
    """
    Build and fit a Bayesian linear-trend-plus-seasonality model using PyMC.

    The expected value of the observations is either ``trend + seasonality``
    (additive) or ``trend * (1 + seasonality)`` (multiplicative), where
    ``trend = alpha + beta * t`` and ``seasonality`` is a linear combination
    of the supplied Fourier features.

    Parameters:
    - t: Time index (numpy array)
    - y: Target variable (numpy array)
    - fourier_features: Fourier features for seasonality (pandas DataFrame, one column per feature)
    - alpha_mu: Mean of the prior for alpha (intercept)
    - alpha_sigma: Standard deviation of the prior for alpha (intercept)
    - beta_mu: Mean of the prior for beta (slope)
    - beta_sigma: Standard deviation of the prior for beta (slope)
    - sigma_sigma: Scale of the half-normal prior for sigma (observation noise)
    - fourier_sigma: Standard deviation of the prior for Fourier coefficients
    - samples: Number of draws for posterior sampling
    - tune: Number of tuning steps for posterior sampling
    - seasonality_type: Type of seasonality ('additive' or 'multiplicative')

    Returns:
    - model: The PyMC model object
    - trace: Posterior samples as returned by ``pm.sample(..., return_inferencedata=True)``
    - prior_predictive: Result of ``pm.sample_prior_predictive()``

    Raises:
    - ValueError: If ``seasonality_type`` is neither 'additive' nor 'multiplicative'
    """
    # Validate up front so a bad argument fails before any model variables are created
    if seasonality_type not in ('additive', 'multiplicative'):
        raise ValueError("seasonality_type must be either 'additive' or 'multiplicative'")

    # Define the coordinates for the model (dimension for Fourier features)
    coords = {"fourier_features": np.arange(fourier_features.shape[1])}

    # Build the PyMC model
    with pm.Model(check_bounds=False, coords=coords) as model:
        # Prior for alpha (intercept)
        alpha = pm.Normal("alpha", mu=alpha_mu, sigma=alpha_sigma)

        # Prior for beta (slope)
        beta = pm.Normal("beta", mu=beta_mu, sigma=beta_sigma)

        # Prior for sigma (observation noise); half-normal keeps it non-negative
        sigma = pm.HalfNormal("sigma", sigma=sigma_sigma)

        # One coefficient per Fourier feature column
        beta_fourier = pm.Normal("beta_fourier", mu=0, sigma=fourier_sigma, dims="fourier_features")

        # Seasonality: coefficients dotted with the transposed feature matrix,
        # giving one value per row of `fourier_features`
        seasonality = pm.Deterministic("seasonality", pm.math.dot(beta_fourier, fourier_features.to_numpy().T))

        # Linear trend, stored as a deterministic so it can be extracted for plotting
        trend = pm.Deterministic("trend", alpha + beta * t)

        # Expected value of the observed variable
        if seasonality_type == 'multiplicative':
            mu = trend * (1 + seasonality)
        else:
            mu = trend + seasonality

        # Likelihood of the observed data
        pm.Normal("likelihood", mu=mu, sigma=sigma, observed=y)

        # Sample from the prior predictive distribution
        prior_predictive = pm.sample_prior_predictive()

        # Sample from the posterior distribution
        trace = pm.sample(samples, tune=tune, return_inferencedata=True)

    # Return the model, the trace, and the prior predictive samples
    return model, trace, prior_predictive

In [None]:
# Fit the additive seasonality model; `fourier_sigma` is set to 0.1 here
# rather than the function default of 10
model, trace, prior_predictive = seasonality_model(
    t, y, fourier_features,
    alpha_mu=0, alpha_sigma=0.5,
    beta_mu=0, beta_sigma=0.5,
    sigma_sigma=0.1, fourier_sigma=0.1,
    samples=500, tune=500,
    seasonality_type='additive',
)
   

In [None]:
def _prior_line(x, y):
    """Return one faint blue line trace for a single prior draw."""
    return go.Scatter(x=x, y=y, mode='lines', line=dict(color='blue', width=1), opacity=0.05)


def _prior_observed_points():
    """Return the black marker trace of the observed deposits."""
    return go.Scatter(x=deposits_df.index, y=deposits_df[target_column], mode='markers', marker=dict(color='black', size=5), name='Data')


# 100-sample subsets of the prior predictive likelihood, the prior trend and the prior seasonality
prior_predictive_samples = az.extract(prior_predictive, group="prior_predictive", num_samples=100)["likelihood"]
prior_trend_lines = az.extract(prior_predictive, group="prior", num_samples=100)["trend"]
prior_seasonality = az.extract(prior_predictive, group="prior", num_samples=100)["seasonality"]

# Prior predictive draws (rescaled by y_max) against the observed data
fig_prior_predictive = go.Figure()
for sample in prior_predictive_samples.T:
    fig_prior_predictive.add_trace(_prior_line(deposits_df.index, sample * y_max))
fig_prior_predictive.add_trace(_prior_observed_points())
fig_prior_predictive.update_layout(title="Prior Predictive", xaxis_title="Date", yaxis_title="Deposits")
fig_prior_predictive.show()

# Prior trend lines (rescaled by y_max) against the observed data
fig_prior_trend = go.Figure()
for trend_line in prior_trend_lines.T:
    fig_prior_trend.add_trace(_prior_line(deposits_df.index, trend_line * y_max))
fig_prior_trend.add_trace(_prior_observed_points())
fig_prior_trend.update_layout(title="Prior Trend Lines", xaxis_title="Date", yaxis_title="Deposits")
fig_prior_trend.show()

# Prior seasonality over the first 12 observations, scaled by 100
fig_prior_seasonality = go.Figure()
for season in prior_seasonality.T:
    fig_prior_seasonality.add_trace(_prior_line(deposits_df.index[:12], season[:12] * 100))
fig_prior_seasonality.update_layout(title="Prior Seasonality", xaxis_title="Date", yaxis_title="Percent Change")
fig_prior_seasonality.show()

In [None]:
# Within the fitted model's context: draw a posterior trace with the sampler
# defaults, then sample the posterior predictive from that trace
with model:
    linear_seasonality_trace = pm.sample(return_inferencedata=True)
    linear_seasonality_posterior = pm.sample_posterior_predictive(linear_seasonality_trace)

In [None]:
def _posterior_line(x, y):
    """Return one faint blue line trace for a single posterior draw."""
    return go.Scatter(x=x, y=y, mode='lines', line=dict(color='blue', width=1), opacity=0.05)


def _posterior_observed_points():
    """Return the black marker trace of the observed deposits."""
    return go.Scatter(x=deposits_df.index, y=deposits_df[target_column], mode='markers', marker=dict(color='black', size=5), name='Data')


# 100-sample subsets of the posterior predictive likelihood, posterior trend and posterior seasonality
posterior_predictive_samples = az.extract(linear_seasonality_posterior, group="posterior_predictive", num_samples=100)["likelihood"]
posterior_trend_lines = az.extract(linear_seasonality_trace, group="posterior", num_samples=100)["trend"]
posterior_seasonality = az.extract(linear_seasonality_trace, group="posterior", num_samples=100)["seasonality"]

# Posterior predictive draws (rescaled by y_max) against the observed data
fig_posterior_predictive = go.Figure()
for sample in posterior_predictive_samples.T:
    fig_posterior_predictive.add_trace(_posterior_line(deposits_df.index, sample * y_max))
fig_posterior_predictive.add_trace(_posterior_observed_points())
fig_posterior_predictive.update_layout(title="Posterior Predictive", xaxis_title="Date", yaxis_title="Deposits")
fig_posterior_predictive.show()

# Posterior trend lines (rescaled by y_max) against the observed data
fig_posterior_trend = go.Figure()
for trend_line in posterior_trend_lines.T:
    fig_posterior_trend.add_trace(_posterior_line(deposits_df.index, trend_line * y_max))
fig_posterior_trend.add_trace(_posterior_observed_points())
fig_posterior_trend.update_layout(title="Posterior Trend Lines", xaxis_title="Date", yaxis_title="Deposits")
fig_posterior_trend.show()

# Posterior seasonality over the first 12 observations, scaled by 100
fig_posterior_seasonality = go.Figure()
for season in posterior_seasonality.T:
    fig_posterior_seasonality.add_trace(_posterior_line(deposits_df.index[:12], season[:12] * 100))
fig_posterior_seasonality.update_layout(title="Posterior Seasonality", xaxis_title="Date", yaxis_title="Percent Change")
fig_posterior_seasonality.show()

In [None]:
# Matplotlib version of the posterior plots: three stacked axes showing the
# posterior predictive draws, the posterior trend lines, and the first 12
# points of the posterior seasonality (scaled by 100).
#
# NOTE(review): this cell looks like a leftover from a different example and
# probably cannot run as written - confirm before executing:
#   - `plt` and `mdates` are not imported in the notebook's visible import cell.
#   - `df["Month"]` and "#Passengers" do not match the deposits data used
#     elsewhere in this notebook (`deposits_df`, `target_column`).
#   - the same three plots are produced with Plotly in the preceding cell.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=False, figsize=(8, 6))
# Panel 0: posterior predictive draws rescaled to the original units via y_max
ax[0].plot(
    df["Month"],
    az.extract_dataset(linear_seasonality_posterior, group="posterior_predictive", num_samples=100)[
        "likelihood"
    ]
    * y_max,
    color="blue",
    alpha=0.05,
)
df.plot.scatter(x="Month", y="#Passengers", color="k", ax=ax[0])
ax[0].set_title("Posterior predictive")
# Panel 1: posterior trend lines rescaled via y_max
ax[1].plot(
    df["Month"],
    az.extract_dataset(linear_seasonality_trace, group="posterior", num_samples=100)["trend"] * y_max,
    color="blue",
    alpha=0.05,
)
df.plot.scatter(x="Month", y="#Passengers", color="k", ax=ax[1])
ax[1].set_title("Posterior trend lines")
# Panel 2: first 12 seasonality values, multiplied by 100 for a percent axis
ax[2].plot(
    df["Month"].iloc[:12],
    az.extract_dataset(linear_seasonality_trace, group="posterior", num_samples=100)["seasonality"][
        :12
    ]
    * 100,
    color="blue",
    alpha=0.05,
)
ax[2].set_title("Posterior seasonality")
ax[2].set_ylabel("Percent change")
# Label the x-axis ticks with abbreviated month names
formatter = mdates.DateFormatter("%b")
ax[2].xaxis.set_major_formatter(formatter);

In [None]:
# Matplotlib version of the prior plots: three stacked axes showing the prior
# predictive draws, the prior trend lines, and the first 12 points of the
# prior seasonality (scaled by 100).
#
# NOTE(review): this cell appears to be a partially ported leftover and
# probably cannot run as written - confirm before executing:
#   - `plt` and `mdates` are not imported in the notebook's visible import cell.
#   - `linear_seasonality_prior` is not assigned anywhere in the visible cells;
#     the prior samples returned by `seasonality_model` are named
#     `prior_predictive`.
#   - `df.plot.scatter(x="Month", y="#Passengers", ...)` does not match the
#     deposits data used elsewhere (`deposits_df`, `target_column`).
#   - `deposits_df.index.iloc[:12]`: pandas Index objects are sliced directly
#     (the Plotly cells use `deposits_df.index[:12]`); `.iloc` likely fails here.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=False, figsize=(8, 6))
# Panel 0: prior predictive draws rescaled to the original units via y_max
ax[0].plot(
    deposits_df.index,
    az.extract_dataset(linear_seasonality_prior, group="prior_predictive", num_samples=100)[
        "likelihood"
    ]
    * y_max,
    color="blue",
    alpha=0.05,
)
df.plot.scatter(x="Month", y="#Passengers", color="k", ax=ax[0])
ax[0].set_title("Prior predictive")
# Panel 1: prior trend lines rescaled via y_max
ax[1].plot(
    deposits_df.index,
    az.extract_dataset(linear_seasonality_prior, group="prior", num_samples=100)["trend"] * y_max,
    color="blue",
    alpha=0.05,
)
df.plot.scatter(x="Month", y="#Passengers", color="k", ax=ax[1])
ax[1].set_title("Prior trend lines")
# Panel 2: first 12 seasonality values, multiplied by 100 for a percent axis
ax[2].plot(
    deposits_df.index.iloc[:12],
    az.extract_dataset(linear_seasonality_prior, group="prior", num_samples=100)["seasonality"][:12]
    * 100,
    color="blue",
    alpha=0.05,
)
ax[2].set_title("Prior seasonality")
ax[2].set_ylabel("Percent change")
# Label the x-axis ticks with abbreviated month names
formatter = mdates.DateFormatter("%b")
ax[2].xaxis.set_major_formatter(formatter);

In [None]:
# Fit the seasonality model (default seasonality type) with 500 draws and 500 tuning steps.
# NOTE(review): `t_normalized` and `y_normalized` are only assigned in commented-out
# cells earlier in this section - confirm they are defined before running this cell.
model, trace, prior_predictive = seasonality_model(
    t_normalized,
    y_normalized,
    fourier_features,
    alpha_mu=0,
    alpha_sigma=0.1,
    beta_mu=0,
    beta_sigma=0.1,
    sigma_sigma=0.1,
    fourier_sigma=10,
    samples=500,
    tune=500,
)

In [None]:
# Define the start and end dates for the plot
# (passed through to the plotting helpers as the x-axis range)
start_date = '2010-01-01'
end_date = '2023-01-01'

# Plot the prior predictive results using Plotly
# NOTE(review): the `plot_prior_predictive_seasonality` definition visible later
# in this notebook reads a 'DATE' column after `reset_index()` - confirm that
# `deposits_df` has an index named 'DATE'.
plot_prior_predictive_seasonality(deposits_df, prior_predictive, y_max, target_column, start_date, end_date)

# Plot the posterior predictive results using Plotly
# NOTE(review): the `plot_posterior_predictive_seasonality` definition visible
# later in this notebook has no `period` parameter - confirm which definition
# is in scope when this cell runs.
plot_posterior_predictive_seasonality(deposits_df, model, trace, y_max, target_column, period=period, start_date=start_date, end_date=end_date)

# Archive

### Linear trend

The general form of a linear regression equation for modeling a linear trend is:

$$ Y = \alpha + \beta \cdot t $$

Where:
- $Y$ is the dependent variable (`DPSACBW027NBOG`).
- $\alpha$ is the intercept.
- $\beta$ is the coefficient of the time variable.
- $t$ is the normalized time variable.

Given the variables, the specific linear regression model equation is:

$$ \text{DPSACBW027NBOG} = \alpha + \beta \cdot t $$

This model captures the linear trend in the deposits over time, with $\alpha$ representing the baseline level of deposits when $t$ is zero, and $\beta$ representing the rate of change in deposits over time.

In [None]:
import pymc as pm
import arviz as az
import numpy as np

def build_and_fit_model(t, y, alpha_mu=0, alpha_sigma=1, beta_mu=0, beta_sigma=1, sigma_sigma=1):
    """
    Fit a Bayesian linear trend ``alpha + beta * t`` to ``y`` with PyMC.

    Parameters:
    - t: Time values used as the regressor
    - y: Observed target values
    - alpha_mu, alpha_sigma: Mean and standard deviation of the normal prior on the intercept
    - beta_mu, beta_sigma: Mean and standard deviation of the normal prior on the slope
    - sigma_sigma: Scale of the half-normal prior on the observation noise

    Returns:
    - linear: The PyMC model
    - trace: Posterior samples (2000 draws after 1000 tuning steps)
    - linear_prior: Result of ``pm.sample_prior_predictive()``
    """
    linear = pm.Model(check_bounds=False)
    with linear:
        # Priors on the intercept, the slope and the noise scale
        intercept = pm.Normal("alpha", mu=alpha_mu, sigma=alpha_sigma)
        slope = pm.Normal("beta", mu=beta_mu, sigma=beta_sigma)
        noise = pm.HalfNormal("sigma", sigma=sigma_sigma)

        # Register the trend as a deterministic so it can be extracted later
        trend = pm.Deterministic("trend", intercept + slope * t)

        # Gaussian likelihood centred on the trend
        pm.Normal("likelihood", mu=trend, sigma=noise, observed=y)

        # Prior predictive first, then the posterior
        linear_prior = pm.sample_prior_predictive()
        trace = pm.sample(2000, tune=1000, return_inferencedata=True)

    return linear, trace, linear_prior

In [None]:
# Group the deposits series with the 'M' resampling rule, then aggregate it
# three different ways for comparison
_deposits_by_month = deposits_df.resample('M')

# Average of the observations in each bin
preprocessed_deposits_df_mean = _deposits_by_month.mean()

# Total of the observations in each bin
preprocessed_deposits_df_sum = _deposits_by_month.sum()

# Last observation in each bin
preprocessed_deposits_df_last = _deposits_by_month.last()

In [None]:
import plotly.graph_objects as go

# (DataFrame, legend label) pairs, drawn in this order: the original series
# followed by the mean, sum and last-value resamples
_resampling_variants = (
    (deposits_df, 'Weekly Data'),
    (preprocessed_deposits_df_mean, 'Monthly Mean'),
    (preprocessed_deposits_df_sum, 'Monthly Sum'),
    (preprocessed_deposits_df_last, 'Monthly Last'),
)

fig = go.Figure()
for _variant_df, _variant_label in _resampling_variants:
    fig.add_trace(go.Scatter(x=_variant_df.index, y=_variant_df['DPSACBW027NBOG'], mode='lines', name=_variant_label))

fig.update_layout(title='Deposits Time Series Resampling Comparison', xaxis_title='Date', yaxis_title='Deposits')
fig.show()


In [None]:
# Run the time-difference test across all five raw input datasets
vm.tests.run_test(
    test_id="validmind.data_validation.TimeSeriesTimeDifference",
    inputs=dict(
        datasets=[
            vm_raw_deposits_ds,
            vm_raw_fedfunds_ds,
            vm_raw_tb3ms_ds,
            vm_raw_gs10_ds,
            vm_raw_gs30_ds,
        ],
    ),
)

## Preprocess data

Convert the deposit target variable `DPSACBW027NBOG` from weekly to monthly:

In [None]:
# Parse every index as datetimes (in place) so the frames can be resampled
for df in (deposits_df, fedfunds_df, tb3ms_df, gs10_df, gs30_df):
    df.index = pd.to_datetime(df.index)

# Resample every series to month-start ("MS") frequency, taking the mean of each month
deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly = (
    raw_frame.resample('MS').mean()
    for raw_frame in (deposits_df, fedfunds_df, tb3ms_df, gs10_df, gs30_df)
)

# Common window: latest first date and earliest last date across all series
_monthly_frames = (deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly)
start_date = max(monthly.index.min() for monthly in _monthly_frames)
end_date = min(monthly.index.max() for monthly in _monthly_frames)

# Cut every series down to that common window
deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly = (
    monthly[start_date:end_date] for monthly in _monthly_frames
)

# Join the series side by side and name the columns after the variables
preprocessed_df = pd.concat(
    [deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly],
    axis=1,
)
preprocessed_df.columns = ['DPSACBW027NBOG', 'FEDFUNDS', 'TB3MS', 'GS10', 'GS30']

# Keep only the historical window used for the empirical analysis
preprocessed_df = preprocessed_df['2010-01-01':'2023-01-01']

preprocessed_df.head()

In [None]:
# Register the combined monthly frame as a ValidMind dataset
vm_preprocessed_ds = vm.init_dataset(input_id="preprocessed_ds", dataset=preprocessed_df, __log=False)

In [None]:
# Run the time-series dataset analysis test on the preprocessed dataset
vm.tests.run_test(
    test_id="validmind.data_validation.TimeSeriesDatasetAnalysis",
    inputs=dict(datasets=[vm_preprocessed_ds]),
)

## Scale the data

First, we’ll scale time to be between 0 and 1:

In [None]:
# Days elapsed since 1900-01-01 for every row of the preprocessed frame
t = (preprocessed_df.index - pd.Timestamp("1900-01-01")).days.to_numpy()

# Min-max scale so the first observation maps to 0 and the last to 1
t_min, t_max = np.min(t), np.max(t)
t = (t - t_min) / (t_max - t_min)

Next, for the target variable, we divide by the maximum. We do this, rather than standardising, so that the sign of the observations is unchanged - this will be necessary for the seasonality component to work properly later on.

In [None]:
# Divide the target by its maximum; `y_max` is kept to rescale model output for plotting
y = preprocessed_df[target_column].to_numpy()
y_max = y.max()
y = y / y_max

In [None]:
# Rebuild the time index from the preprocessed frame
t = preprocess_time_index(preprocessed_df)

# Fourier features of order 10 computed from that time index
n_order = 10
fourier_features = generate_fourier_features(t, n_order)

# Fit the seasonality model (default seasonality type) with 500 draws and 500 tuning steps
model, trace, prior_predictive = seasonality_model(
    t, y, fourier_features,
    alpha_mu=0, alpha_sigma=0.5,
    beta_mu=0, beta_sigma=0.5,
    sigma_sigma=0.1, fourier_sigma=10,
    samples=500, tune=500,
)

In [None]:
import plotly.graph_objects as go

def plot_prior_predictive(preprocessed_df, linear_prior, y_max, target_column, start_date=None, end_date=None):
    """
    Show two Plotly figures for the prior of the linear trend model.

    The first figure overlays 100 prior predictive draws on the observed data;
    the second overlays 100 prior trend lines. Model output is multiplied by
    `y_max` to return it to the units of the observed data.

    Parameters:
    - preprocessed_df: DataFrame holding the observed data; its index is used as the x-axis
    - linear_prior: Prior samples with "prior_predictive" and "prior" groups
      (presumably the result of `pm.sample_prior_predictive()` - confirm against caller)
    - y_max: Scale factor that was used to normalise the target
    - target_column: Name of the observed column in `preprocessed_df`
    - start_date, end_date: Optional x-axis limits; applied only when both are given
    """
    # `az.extract` is used here to match the rest of the notebook (it replaces
    # the older `az.extract_dataset` spelling)
    prior_predictive_samples = az.extract(linear_prior, group="prior_predictive", num_samples=100)["likelihood"]
    prior_trend_lines = az.extract(linear_prior, group="prior", num_samples=100)["trend"]

    def _show_overlay(curves, title):
        """Draw every sampled curve faintly, then the observed data on top."""
        fig = go.Figure()

        # Transpose so that iteration yields one sampled curve at a time
        for curve in curves.T:
            fig.add_trace(go.Scatter(x=preprocessed_df.index, y=curve * y_max, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

        # Observed data is plotted against the index directly, so the frame
        # does not need an index named 'DATE'
        fig.add_trace(go.Scatter(x=preprocessed_df.index, y=preprocessed_df[target_column], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))

        fig.update_layout(title=title, xaxis_title="Date", yaxis_title=target_column)

        # Set x-axis range if start_date and end_date are provided
        if start_date is not None and end_date is not None:
            fig.update_xaxes(range=[start_date, end_date])

        fig.show()

    _show_overlay(prior_predictive_samples, "Prior Predictive")
    _show_overlay(prior_trend_lines, "Prior Trend Lines")


In [None]:
import plotly.graph_objects as go

def plot_posterior_predictive(preprocessed_df, linear, trace, y_max, target_column, start_date=None, end_date=None):
    """
    Show two Plotly figures for the posterior of the linear trend model.

    Samples the posterior predictive from `trace` under the model `linear`,
    then overlays 100 posterior predictive draws on the observed data in the
    first figure and 100 posterior trend lines in the second. Model output is
    multiplied by `y_max` to return it to the units of the observed data.

    Parameters:
    - preprocessed_df: DataFrame holding the observed data; its index is used as the x-axis
    - linear: The PyMC model the trace was sampled from
    - trace: Posterior samples containing a "posterior" group with a "trend" variable
    - y_max: Scale factor that was used to normalise the target
    - target_column: Name of the observed column in `preprocessed_df`
    - start_date, end_date: Optional x-axis limits; applied only when both are given
    """
    # Sample from the posterior predictive distribution
    posterior_predictive = pm.sample_posterior_predictive(trace, model=linear)

    # `az.extract` is used here to match the rest of the notebook (it replaces
    # the older `az.extract_dataset` spelling)
    posterior_predictive_samples = az.extract(posterior_predictive, group="posterior_predictive", num_samples=100)["likelihood"]
    posterior_trend_lines = az.extract(trace, group="posterior", num_samples=100)["trend"]

    def _show_overlay(curves, title):
        """Draw every sampled curve very faintly, then the observed data on top."""
        fig = go.Figure()

        # Transpose so that iteration yields one sampled curve at a time
        for curve in curves.T:
            fig.add_trace(go.Scatter(x=preprocessed_df.index, y=curve * y_max, mode='lines', line=dict(color='blue', width=1), opacity=0.01))

        # Observed data is plotted against the index directly, so the frame
        # does not need an index named 'DATE'
        fig.add_trace(go.Scatter(x=preprocessed_df.index, y=preprocessed_df[target_column], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))

        fig.update_layout(title=title, xaxis_title="Date", yaxis_title=target_column)

        # Set x-axis range if start_date and end_date are provided
        if start_date is not None and end_date is not None:
            fig.update_xaxes(range=[start_date, end_date])

        fig.show()

    _show_overlay(posterior_predictive_samples, "Posterior Predictive")
    _show_overlay(posterior_trend_lines, "Posterior Trend Lines")


In [None]:
# Guard flag: the linear-trend experiment below only executes when this is True
run = False
if run:
    # Fit the linear trend model with unit-scale priors
    linear, trace, linear_prior = build_and_fit_model(
        t, y,
        alpha_mu=0, alpha_sigma=1,
        beta_mu=0, beta_sigma=1,
        sigma_sigma=1,
    )

    # Date window used as the x-axis range of the figures
    start_date, end_date = '2010-01-01', '2023-01-01'

    # Prior figures first, then posterior figures
    plot_prior_predictive(
        preprocessed_df, linear_prior, y_max, target_column,
        start_date=start_date, end_date=end_date,
    )
    plot_posterior_predictive(
        preprocessed_df, linear, trace, y_max, target_column,
        start_date=start_date, end_date=end_date,
    )

Before going on to anything more complicated, let’s try conditioning on the data and doing a posterior predictive check:

## Model seasonality with Prophet

$ \text{Deposits} \sim (\alpha + \beta \cdot \text{time})(1 + \text{seasonality}) $

In [None]:
def build_and_fit_model_with_seasonality(t, y, fourier_features, alpha_mu=0, alpha_sigma=0.5, beta_mu=0, beta_sigma=0.5, sigma_sigma=0.1, fourier_sigma=10, samples=500, tune=500):
    """
    Fit a linear trend with multiplicative Fourier seasonality using PyMC.

    The expected value is ``(alpha + beta * t) * (1 + seasonality)``, where
    ``seasonality`` is the Fourier feature matrix multiplied by its coefficients.

    Parameters:
    - t: Time values used as the trend regressor
    - y: Observed target values
    - fourier_features: DataFrame with one column per Fourier feature
    - alpha_mu, alpha_sigma: Normal prior on the intercept
    - beta_mu, beta_sigma: Normal prior on the slope
    - sigma_sigma: Scale of the half-normal prior on the observation noise
    - fourier_sigma: Standard deviation of the zero-mean normal prior on each Fourier coefficient
    - samples, tune: Number of posterior draws and tuning steps

    Returns:
    - model, trace, prior_predictive
    """
    # One coordinate label per Fourier feature column
    feature_labels = np.arange(fourier_features.shape[1])

    model = pm.Model(check_bounds=False, coords={"fourier_features": feature_labels})
    with model:
        # Priors
        intercept = pm.Normal("alpha", mu=alpha_mu, sigma=alpha_sigma)
        slope = pm.Normal("beta", mu=beta_mu, sigma=beta_sigma)
        noise = pm.HalfNormal("sigma", sigma=sigma_sigma)
        coefficients = pm.Normal("beta_fourier", mu=0, sigma=fourier_sigma, dims="fourier_features")

        # Deterministic components, stored so they can be extracted for plotting
        seasonal_part = pm.Deterministic("seasonality", pm.math.dot(fourier_features.to_numpy(), coefficients))
        trend_part = pm.Deterministic("trend", intercept + slope * t)

        # Multiplicative combination of trend and seasonality
        expected = trend_part * (1 + seasonal_part)
        pm.Normal("likelihood", mu=expected, sigma=noise, observed=y)

        # Prior predictive first, then the posterior
        prior_predictive = pm.sample_prior_predictive()
        trace = pm.sample(samples, tune=tune, return_inferencedata=True)

    return model, trace, prior_predictive


In [None]:
import plotly.graph_objects as go

def plot_prior_predictive_seasonality(preprocessed_df, prior_predictive, y_max, target_column, start_date=None, end_date=None):
    """
    Show three Plotly figures for the prior of the seasonality model.

    The figures are: 100 prior predictive draws over the observed data,
    100 prior trend lines over the observed data, and the first 12 points of
    100 prior seasonality curves (multiplied by 100). The first two are
    multiplied by `y_max` to return them to the units of the observed data.

    Parameters:
    - preprocessed_df: DataFrame holding the observed data; its index is used as the x-axis
    - prior_predictive: Prior samples with "prior_predictive" and "prior" groups
      (presumably the result of `pm.sample_prior_predictive()` - confirm against caller)
    - y_max: Scale factor that was used to normalise the target
    - target_column: Name of the observed column in `preprocessed_df`
    - start_date, end_date: Optional x-axis limits for the first two figures;
      applied only when both are truthy
    """
    # `az.extract` is used here to match the rest of the notebook (it replaces
    # the older `az.extract_dataset` spelling)
    prior_predictive_samples = az.extract(prior_predictive, group="prior_predictive", num_samples=100)["likelihood"]
    prior_trend_lines = az.extract(prior_predictive, group="prior", num_samples=100)["trend"]
    prior_seasonality = az.extract(prior_predictive, group="prior", num_samples=100)["seasonality"]

    def _show_overlay(curves, title):
        """Draw every sampled curve faintly, then the observed data on top."""
        fig = go.Figure()

        # Transpose so that iteration yields one sampled curve at a time
        for curve in curves.T:
            fig.add_trace(go.Scatter(x=preprocessed_df.index, y=curve * y_max, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

        # Observed data is plotted against the index directly, so the frame
        # does not need an index named 'DATE'
        fig.add_trace(go.Scatter(x=preprocessed_df.index, y=preprocessed_df[target_column], mode='lines+markers', marker=dict(color='black', size=5), name='Data'))

        fig.update_layout(title=title, xaxis_title="Date", yaxis_title=target_column)

        if start_date and end_date:
            fig.update_xaxes(range=[start_date, end_date])

        fig.show()

    _show_overlay(prior_predictive_samples, "Prior Predictive")
    _show_overlay(prior_trend_lines, "Prior Trend Lines")

    # Seasonality figure: only the first 12 observations, no observed data
    # overlay and no x-axis range override
    fig_seasonality = go.Figure()

    for season in prior_seasonality.T:
        fig_seasonality.add_trace(go.Scatter(x=preprocessed_df.index[:12], y=season[:12] * 100, mode='lines', line=dict(color='blue', width=1), opacity=0.05))

    fig_seasonality.update_layout(title="Prior Seasonality", xaxis_title="Date", yaxis_title="Percent change")

    fig_seasonality.show()


In [None]:
import plotly.graph_objects as go

def plot_posterior_predictive_seasonality(preprocessed_df, model, trace, y_max, target_column, start_date=None, end_date=None):
    """
    Plot posterior predictive draws, posterior trend lines and posterior seasonality with Plotly.

    Three figures are displayed in order: "Posterior Predictive", "Posterior Trend Lines"
    and "Posterior Seasonality". The first two overlay the observed series on the draws;
    the third shows only the first 12 points of each seasonality draw, multiplied by 100.

    Parameters:
    - preprocessed_df: DataFrame whose index supplies the x-axis values and whose
      `target_column` holds the observed series
    - model: model passed to `pm.sample_posterior_predictive`
    - trace: fitted trace; its "posterior" group must contain "trend" and "seasonality"
    - y_max: factor the predictive and trend draws are multiplied by before plotting
      (presumably the scale used to normalise the target; confirm with the caller)
    - target_column: Name of the observed column, also used as the y-axis title
    - start_date: Optional left x-axis limit (applied only when end_date is also given)
    - end_date: Optional right x-axis limit (applied only when start_date is also given)

    Returns:
    - None. The figures are rendered with `fig.show()`.
    """
    # Use the index directly for every trace. The previous version read a hard-coded
    # 'DATE' column from reset_index(), which raised KeyError for any other index name.
    dates = preprocessed_df.index
    observed = preprocessed_df[target_column]

    def _add_draw_lines(figure, draws, opacity):
        # One faint blue line per draw; draws are rescaled by y_max.
        for draw in draws.T:
            figure.add_trace(go.Scatter(x=dates, y=draw * y_max, mode='lines', line=dict(color='blue', width=1), opacity=opacity))

    def _overlay_data_and_show(figure, title):
        # Overlay the observed series, label the axes, apply the optional date window, render.
        figure.add_trace(go.Scatter(x=dates, y=observed, mode='lines+markers', marker=dict(color='black', size=5), name='Data'))
        figure.update_layout(title=title, xaxis_title="Date", yaxis_title=target_column)
        if start_date and end_date:
            figure.update_xaxes(range=[start_date, end_date])
        figure.show()

    # Sample from the posterior predictive distribution
    posterior_predictive = pm.sample_posterior_predictive(trace, model=model)

    # Each extraction draws its own 100 samples.
    # NOTE(review): newer ArviZ releases expose this helper as `az.extract`; confirm against the pinned version.
    posterior_predictive_samples = az.extract_dataset(posterior_predictive, group="posterior_predictive", num_samples=100)["likelihood"]
    posterior_trend_lines = az.extract_dataset(trace, group="posterior", num_samples=100)["trend"]
    posterior_seasonality = az.extract_dataset(trace, group="posterior", num_samples=100)["seasonality"]

    # Posterior predictive draws against the data
    fig = go.Figure()
    _add_draw_lines(fig, posterior_predictive_samples, opacity=0.01)
    _overlay_data_and_show(fig, "Posterior Predictive")

    # Posterior trend lines against the data
    fig_trend = go.Figure()
    _add_draw_lines(fig_trend, posterior_trend_lines, opacity=0.01)
    _overlay_data_and_show(fig_trend, "Posterior Trend Lines")

    # Posterior seasonality: first 12 points of each draw, shown as a percentage (x100)
    fig_seasonality = go.Figure()
    for season in posterior_seasonality.T:
        fig_seasonality.add_trace(go.Scatter(x=dates[:12], y=season[:12] * 100, mode='lines', line=dict(color='blue', width=1), opacity=0.05))
    fig_seasonality.update_layout(title="Posterior Seasonality", xaxis_title="Date", yaxis_title="Percent change")
    fig_seasonality.show()


In [None]:
def preprocess_time_index(df):
    """
    Express a DataFrame's datetime index as time in years since 1900-01-01.

    Parameters:
    - df: DataFrame with a datetime index

    Returns:
    - t: pandas Series of elapsed years (whole days / 365.25), indexed like `df`
    """
    origin = pd.Timestamp("1900-01-01")
    elapsed_days = (df.index - origin).days
    # Keep the original index so the result lines up row-for-row with df
    t = pd.Series(elapsed_days / 365.25, index=df.index)
    print(f"Type of t in preprocess_time_index: {type(t)}")
    print(f"First few elements of t in preprocess_time_index: {t[:5]}")
    return t


In [None]:
def generate_fourier_features(t, n_order):
    """
    Build sine/cosine Fourier features of `t` for orders 1 through n_order.

    Each feature is sin or cos of `2 * pi * t * order`, i.e. `order` full cycles
    per unit of `t`.

    Parameters:
    - t: Time values as a pandas Series or an array-like (e.g. numpy array)
    - n_order: Highest Fourier order to generate

    Returns:
    - fourier_features: DataFrame with columns sin_order_1, cos_order_1, ...,
      sin_order_<n_order>, cos_order_<n_order>. It is indexed by `t.index` when `t`
      is a Series and by a default integer index otherwise.
    """
    print(f"Type of t in generate_fourier_features: {type(t)}")
    print(f"First few elements of t in generate_fourier_features: {t[:5]}")

    if isinstance(t, pd.Series):
        t_numeric = t.values
        feature_index = t.index
    else:
        # Plain arrays carry no index. Previously this path crashed on `t.index`
        # even though the value extraction explicitly allowed non-Series input.
        t_numeric = np.asarray(t)
        feature_index = None
    print(f"Type of t_numeric in generate_fourier_features: {type(t_numeric)}")
    print(f"First few elements of t_numeric in generate_fourier_features: {t_numeric[:5]}")

    fourier_features = pd.DataFrame(
        {
            f"{func}_order_{order}": getattr(np, func)(2 * np.pi * t_numeric * order)
            for order in range(1, n_order + 1)
            for func in ("sin", "cos")
        },
        index=feature_index,
    )
    print(f"Generated Fourier features:\n{fourier_features.head()}")
    return fourier_features


In [None]:
# Convert the datetime index to time in years since 1900-01-01.
# (preprocess_time_index does not rescale to [0, 1].)
t = preprocess_time_index(preprocessed_df)

# Generate Fourier features for seasonality: sin/cos pairs for orders 1..n_order
n_order = 10
fourier_features = generate_fourier_features(t, n_order)

# Build and fit the model with seasonality, using 500 posterior samples and 500 tuning steps.
# `y` must already be defined by an earlier cell.
# NOTE(review): the fitting helper visible at the top of this notebook is `seasonality_model`,
# which takes this same parameter list. Confirm `build_and_fit_model_with_seasonality`
# is defined before this cell runs.
model, trace, prior_predictive = build_and_fit_model_with_seasonality(
    t, y, fourier_features,
    alpha_mu=0, alpha_sigma=0.5, beta_mu=0, beta_sigma=0.5, sigma_sigma=0.1, fourier_sigma=10,
    samples=500, tune=500
)
# Define the start and end dates for the plot (x-axis window of the first two figures)
start_date = '2010-01-01'
end_date = '2023-01-01'

# Plot the prior predictive results using Plotly.
# `y_max` and `target_column` must already be defined by an earlier cell.
plot_prior_predictive_seasonality(preprocessed_df, prior_predictive, y_max, target_column, start_date, end_date)

# Plot the posterior predictive results using Plotly
plot_posterior_predictive_seasonality(preprocessed_df, model, trace, y_max, target_column, start_date, end_date)

## Seasonality Analysis

The seasonal plot shows the deviation from the trend, so its units are those of the series itself: billions of U.S. dollars.

The y-axis of the seasonal plot ranges from approximately -20 to +30, which suggests that the seasonal fluctuations in deposit volumes can vary by tens of billions of dollars throughout the year. These fluctuations are significant and should be considered when analyzing the data for predictive modeling or other financial analyses.

In [None]:
# Decompose the deposits series with an additive model.
# No `period` is passed, so seasonal_decompose has to take it from the index frequency.
# NOTE(review): confirm deposits_df has a DatetimeIndex with a set frequency at this point.
result = seasonal_decompose(deposits_df['DPSACBW027NBOG'], model='additive')

# Plot the seasonal component of the decomposition as a line chart
seasonal = result.seasonal
fig = px.line(seasonal, title='Seasonal Component of Deposits')
fig.show()

In [None]:
# Run the ValidMind test registered as "validmind.data_validation.SeasonalDecompose"
# with `vm_raw_deposits_ds` as its dataset input.
vm.tests.run_test(
    test_id="validmind.data_validation.SeasonalDecompose",
    inputs={
        "dataset": vm_raw_deposits_ds
    }
)

In [None]:
# Run the ValidMind test registered as "validmind.data_validation.AutoSeasonality"
# with `vm_raw_deposits_ds` as its dataset input.
vm.tests.run_test(
    test_id="validmind.data_validation.AutoSeasonality",
    inputs={
        "dataset": vm_raw_deposits_ds
    }
)

Convert the deposit target variable `DPSACBW027NBOG` from weekly to monthly.

In [None]:
# Ensure all DataFrames have datetime index (required by the resample calls that follow).
# Each frame is modified in place. After the loop the name `df` stays bound to the
# last frame, gs30_df.
for df in [deposits_df, fedfunds_df, tb3ms_df, gs10_df, gs30_df]:
    df.index = pd.to_datetime(df.index)

In [None]:
# Resample deposits_df to monthly start frequency ("MS").
# Deposits are a level (stock) series, so the weekly readings are averaged within each
# month, matching the other series. Summing them would scale every month by its number
# of weekly observations (4 or 5) and inject a spurious calendar pattern.
deposits_monthly = deposits_df.resample('MS').mean()

In [None]:
# Put the remaining series on the same month-start ("MS") grid, averaging within each month
fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly = (
    rate_df.resample('MS').mean()
    for rate_df in (fedfunds_df, tb3ms_df, gs10_df, gs30_df)
)

In [None]:
# Common window shared by all five monthly frames:
# the latest of their first dates and the earliest of their last dates
start_date = max(
    frame.index.min()
    for frame in (deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly)
)
end_date = min(
    frame.index.max()
    for frame in (deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly)
)

In [None]:
# Cut every monthly frame down to the shared [start_date, end_date] window
deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly = (
    frame[start_date:end_date]
    for frame in (deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly)
)

In [None]:
# Place the five monthly frames side by side and label the columns with their series IDs
preprocessed_df = pd.concat(
    [deposits_monthly, fedfunds_monthly, tb3ms_monthly, gs10_monthly, gs30_monthly],
    axis="columns",
).set_axis(['DPSACBW027NBOG', 'FEDFUNDS', 'TB3MS', 'GS10', 'GS30'], axis="columns")
# Notebook display: name of the combined frame's index
preprocessed_df.index.name

In [None]:
# Notebook display: first five rows of the combined monthly frame
preprocessed_df.head()

## Scale the data

First, we’ll scale time to be between 0 and 1:

In [None]:
# Days since 1900-01-01 for every row of the modelling frame, min-max scaled to [0, 1].
# The dates come from preprocessed_df's datetime index. The previous `df["Month"]` read
# the leftover loop variable `df` from the datetime-index cell and a "Month" column
# that none of the frames built in this notebook section defines.
t = (preprocessed_df.index - pd.Timestamp("1900-01-01")).days.to_numpy()
t_min = np.min(t)
t_max = np.max(t)
# Requires at least two distinct dates; otherwise t_max == t_min and the ratio is undefined.
t = (t - t_min) / (t_max - t_min)

## Fit Bayesian Model

In [None]:
# Predictor matrix (X): the four non-deposit columns; target (y): the deposits column
X = preprocessed_df.loc[:, ['FEDFUNDS', 'TB3MS', 'GS10', 'GS30']]
y = preprocessed_df.loc[:, 'DPSACBW027NBOG']

Define the Bayesian model:

In [None]:
import pymc as pm

# Bayesian linear regression of y on the columns of X:
#   y_obs ~ Normal(alpha + X . beta, sigma)
# NOTE(review): X and y enter on their original scale (only the time index is rescaled
# above). Confirm that the Normal(0, 10) / HalfNormal(1) priors are wide enough for
# those units, and that X contains no missing values.
with pm.Model() as model:
    # Priors for unknown model parameters
    alpha = pm.Normal('alpha', mu=0, sigma=10)  # intercept
    beta = pm.Normal('beta', mu=0, sigma=10, shape=X.shape[1])  # one coefficient per predictor column
    sigma = pm.HalfNormal('sigma', sigma=1)  # observation noise scale (positive)

    # Expected value of outcome
    mu = alpha + pm.math.dot(X, beta)

    # Likelihood (sampling distribution) of observations
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)

    # Inference: sample from the posterior (2000 kept draws after 1000 tuning steps),
    # returned as an InferenceData object
    trace = pm.sample(2000, tune=1000, return_inferencedata=True)

# Print summary of the trace to ensure sufficient samples
print(pm.summary(trace))

Generate posterior predictive samples:

In [None]:
import numpy as np

# Generate posterior predictive samples
with model:
    posterior_predictive = pm.sample_posterior_predictive(trace, random_seed=42)

# Inspect the keys in the 'posterior_predictive' object to find the variable names
print("Available groups in posterior_predictive:", posterior_predictive.groups())
print("Keys in 'posterior_predictive' group:", list(posterior_predictive.posterior_predictive.keys()))

# Use the correct key for accessing the simulated deposits
simulated_deposits_key = 'y_obs' if 'y_obs' in posterior_predictive.posterior_predictive else list(posterior_predictive.posterior_predictive.keys())[0]
simulated_deposits = posterior_predictive.posterior_predictive[simulated_deposits_key]

# Check the shape of simulated_deposits
print(f"Shape of simulated_deposits: {simulated_deposits.shape}")

# The array carries separate "chain" and "draw" dimensions ahead of the observation
# dimension. Collapse them into one "sample" dimension so that each sample is a complete
# simulated series. The previous indexing (`shape[0]` and `[..., :, 0]`) counted chains
# instead of samples and returned every draw of the *first* observation only.
stacked_deposits = simulated_deposits.stack(sample=("chain", "draw")).transpose("sample", ...)

# Determine the number of available samples (chains x draws)
n_samples = stacked_deposits.sizes["sample"]

# Set the number of paths to extract, ensuring it does not exceed the available samples
n_paths = min(10, n_samples)
print(f"Number of available samples: {n_samples}, Number of paths to extract: {n_paths}")

# Example: extract 10 different paths from the simulated distribution.
# The result has shape (n_paths, n_observations).
path_ids = np.random.choice(n_samples, n_paths, replace=False)
simulated_paths = stacked_deposits.isel(sample=path_ids).values

# Each path must have one value per row of preprocessed_df. A mismatch now raises
# instead of being silently truncated, since it means the model data and the frame differ.
expected_length = len(preprocessed_df.index)
if simulated_paths.shape[1] != expected_length:
    raise ValueError(f"Index length {len(preprocessed_df.index)} does not match data length {simulated_paths.shape[1]}.")

# Convert the simulated paths to a DataFrame (one column per simulated path)
simulated_paths_df = pd.DataFrame(simulated_paths.T, index=preprocessed_df.index, columns=[f'Simulation_{i+1}' for i in range(n_paths)])

print("Simulated Corporate Deposits:")
print(simulated_paths_df.head())