In [0]:
# Install the packages listed in requirements.txt using the %pip line magic.
%pip install -r requirements.txt

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import plot
from plotly.subplots import make_subplots
from statsmodels.tsa.seasonal import STL
import plotly.graph_objects as go
from functools import reduce 
import glob

## Exploratory Data Analysis for Total Household Deposits

This notebook documents the exploratory data analysis (EDA) conducted on the original set of 49 variables. Following the initial analysis and subsequent feature reduction, the majority of these variables were excluded from further processing. Only a small subset of relevant features was retained and carried forward into the feature selection and model development stages (see `00_data_processing` for exact list of remaining features).

In [0]:
# Load the legacy extract, which still carries the original 49 variables
# that were used for the initial exploration.
filepath = '../data/processed/legacy_combined_df_for_eda.csv'
df = pd.read_csv(filepath)

# Keep the analysis window only: from 2004-01 (inclusive) up to, but not
# including, 2025-04. The comparison runs on the column exactly as read_csv
# returned it.
in_window = (df['date'] >= '2004-01') & (df['date'] < '2025-04')
df = df[in_window]
df

## Granger Causality and Lagged Correlation Analysis

This section investigates the temporal relationships between the target variable and the predictor features using two complementary approaches: the Granger causality test and lagged correlation analysis. 

Since the raw time series data can include overlapping trends, seasonality, and residual noise, STL decomposition was first applied to separate each series into its **trend**, **seasonal**, and **residual** components. Granger causality tests were then performed on each component individually to identify features that have statistically significant predictive power for the target variable across various time lags (up to 12 months). Features were considered causally related if any lag produced a p-value ≤ 0.05.

In parallel, lagged Pearson correlations were computed to identify features that are strongly correlated with the target variable at specific lags. This provides additional insight into the temporal alignment of feature-target relationships, even if those relationships are not strictly causal.

By analyzing each STL component separately, this approach helps to uncover both predictive and contemporaneous relationships that may not be visible in the raw data due to noise or confounding temporal patterns.



In [0]:
def decompose_stl(df, date_col, target_col, predictor_cols, period=12):
    """Split the target and every predictor into STL trend/seasonal/residual parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``date_col``, ``target_col`` and all ``predictor_cols``.
        The caller's frame is not modified.
    date_col : str
        Column parsed with ``pd.to_datetime``; it becomes the index of the
        returned frames.
    target_col : str
        Name of the target series.
    predictor_cols : iterable of str
        Names of the predictor series (any iterable; a list is not required).
    period : int, default 12
        Seasonal period handed to ``STL``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(trend_df, seasonal_df, residual_df)``. Each frame is indexed by the
        parsed dates and has one column per series, target first.
    """
    ordered = df.copy()
    # Parse dates BEFORE sorting so rows end up in chronological order even
    # when the raw column holds strings that do not sort lexicographically.
    ordered[date_col] = pd.to_datetime(ordered[date_col])
    ordered = ordered.sort_values(date_col).set_index(date_col)

    all_cols = [target_col] + list(predictor_cols)

    trend_data = {}
    seasonal_data = {}
    residual_data = {}

    for col in all_cols:
        # Non-numeric entries are coerced to NaN rather than raising here.
        # NOTE(review): how STL reacts to NaN input is not handled in this
        # function — confirm the inputs are gap-free.
        series = pd.to_numeric(ordered[col], errors='coerce')
        fitted = STL(series, period=period).fit()

        trend_data[col] = fitted.trend
        seasonal_data[col] = fitted.seasonal
        residual_data[col] = fitted.resid

    # One DataFrame per STL component, all sharing the date index.
    trend_df = pd.DataFrame(trend_data, index=ordered.index)
    seasonal_df = pd.DataFrame(seasonal_data, index=ordered.index)
    residual_df = pd.DataFrame(residual_data, index=ordered.index)

    return trend_df, seasonal_df, residual_df

def compute_df_correlation(df, date_col = 'date'):
    """Return the pairwise correlation matrix of ``df``, ignoring the date column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated against each other.
    date_col : str, default 'date'
        Column to leave out of the correlation. It is skipped silently when
        ``df`` has no such column (e.g. when the dates are the index).

    Returns
    -------
    pandas.DataFrame
        Result of ``DataFrame.corr()`` on the remaining columns.
    """
    # The date column is an identifier, not a feature. Leaving a string or
    # datetime column in the frame makes ``corr()`` fail on pandas versions
    # that no longer drop non-numeric columns silently.
    features = df.drop(columns=[date_col], errors='ignore')
    return features.corr()

def plot_target_correlation(df, target_col, type = 'trend'):
    """Draw a one-column heatmap of ``|df[target_col]|``, largest value on top.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels become the heatmap rows.
    target_col : str
        Column whose absolute values are plotted.
    type : str, default 'trend'
        Label inserted into the heatmap column name and the figure title.
        (The name shadows the builtin but is kept for caller compatibility.)

    Returns
    -------
    None
        A new matplotlib figure is created as a side effect.
    """
    abs_col = f'abs_{type}_correlation'
    ranked = (
        df[target_col]
        .abs()
        .to_frame(name=abs_col)
        .sort_values(abs_col, ascending=False)
    )

    # Colour scale is pinned to [0, 1] because the plotted values are absolute.
    plt.figure(figsize=(10, 15))
    sns.heatmap(ranked, annot=True, cmap='coolwarm', vmin = 0, vmax = 1)
    plt.title(f'Correlation with Household Deposit ({type})')

## Granger Causality Test
from statsmodels.tsa.stattools import grangercausalitytests

def granger_causality_tests(df, target_col, predictor_cols, maxlag=12):
    """Run a Granger causality test of every predictor against the target.

    For each predictor, ``grangercausalitytests`` is run on the two-column
    frame ``[target_col, predictor]`` for lags ``1..maxlag`` and the
    ``ssr_ftest`` p-value of each lag is collected. A predictor is reported
    only if at least one lag has a p-value <= 0.05.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and predictor series.
    target_col : str
        Name of the target column.
    predictor_cols : iterable of str
        Names of the predictor columns to test.
    maxlag : int, default 12
        Highest lag passed to ``grangercausalitytests``.

    Returns
    -------
    pandas.DataFrame
        Columns ``predictor``, ``min_pvalue_lag``, ``min_pvalue`` and
        ``significant_lags``, sorted by ascending ``min_pvalue``. When no
        predictor qualifies the frame is empty but still has these columns,
        so downstream merges on ``predictor`` keep working.
    """
    result_columns = ['predictor', 'min_pvalue_lag', 'min_pvalue', 'significant_lags']
    rows = []

    for pred in predictor_cols:
        test_result = grangercausalitytests(df[[target_col, pred]], maxlag=maxlag, verbose=False)
        # p-value of the SSR-based F test for each tested lag.
        lag_pvals = {lag: outcome[0]['ssr_ftest'][1] for lag, outcome in test_result.items()}

        significant_lags = [lag for lag, pval in lag_pvals.items() if pval <= 0.05]
        if not significant_lags:
            continue

        # Among the significant lags, report the one with the smallest p-value.
        min_lag = min(significant_lags, key=lag_pvals.get)
        rows.append({
            'predictor': pred,
            'min_pvalue_lag': min_lag,
            'min_pvalue': lag_pvals[min_lag],
            'significant_lags': significant_lags
        })

    # Explicit columns keep the sort below from raising KeyError when
    # ``rows`` is empty.
    results_df = pd.DataFrame(rows, columns=result_columns)
    return results_df.sort_values('min_pvalue').reset_index(drop=True)

## Correlations With Lags
def lagged_correlations(df, target_col, predictor_cols, maxlag=12):
    """Find, per predictor, the lag with the strongest correlation to the target.

    The target is correlated with each predictor shifted by ``0..maxlag``
    rows (``Series.shift``), and the lag with the largest absolute
    correlation is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target and predictor series.
    target_col : str
        Name of the target column.
    predictor_cols : iterable of str
        Names of the predictor columns.
    maxlag : int, default 12
        Largest shift applied to each predictor.

    Returns
    -------
    pandas.DataFrame
        Columns ``predictor``, ``max_corr_lag``, ``max_abs_corr`` and
        ``significant_lags``, sorted by descending ``max_abs_corr``.
        ``significant_lags`` lists the lags whose correlation is defined
        (not NaN); no significance test is applied. A predictor with no
        defined correlation gets ``max_corr_lag=None`` and
        ``max_abs_corr=NaN``. The frame is empty, with these columns, when
        ``predictor_cols`` is empty.
    """
    result_columns = ['predictor', 'max_corr_lag', 'max_abs_corr', 'significant_lags']
    target = df[target_col]
    results = []

    for pred in predictor_cols:
        lag_corrs = {lag: target.corr(df[pred].shift(lag)) for lag in range(maxlag + 1)}
        sig_lags = [lag for lag, c in lag_corrs.items() if pd.notna(c)]

        if sig_lags:
            best_lag = max(sig_lags, key=lambda lag: abs(lag_corrs[lag]))
            best_corr = abs(lag_corrs[best_lag])
        else:
            # NaN (not None) keeps ``max_abs_corr`` numeric, so sorting works
            # even when every predictor ends up in this branch.
            best_lag, best_corr = None, float('nan')

        results.append({
            'predictor': pred,
            'max_corr_lag': best_lag,
            'max_abs_corr': best_corr,
            'significant_lags': sig_lags
        })

    if not results:
        return pd.DataFrame(columns=result_columns)

    # ``max_abs_corr`` is already an absolute value, so a plain descending
    # sort ranks predictors by correlation strength (NaN rows go last).
    ranked = pd.DataFrame(results, columns=result_columns)
    return ranked.sort_values(by='max_abs_corr', ascending=False).reset_index(drop=True)



def plot_multi_plots(df, date_col, target_col, predictor_cols,  ncols=3, figsize_per_plot=(6, 3)):
    """Plot the target against each predictor on a grid of dual-axis charts.

    Every subplot draws the target in blue on the left y-axis and one
    predictor in red on a twin right y-axis, sharing the x-axis across the
    grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``, ``target_col`` and the predictors.
    date_col : str
        Column used for the x-axis.
    target_col : str
        Column drawn on the primary (left) y-axis of every subplot.
    predictor_cols : iterable of str
        Columns drawn one per subplot on the secondary (right) y-axis.
    ncols : int, default 3
        Number of subplot columns in the grid.
    figsize_per_plot : tuple of (float, float), default (6, 3)
        Width and height allotted to each subplot; the figure size is this
        multiplied by the grid dimensions.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``. Nothing is drawn when
        ``predictor_cols`` is empty.
    """
    predictors = list(predictor_cols)
    nplots = len(predictors)
    if nplots == 0:
        # Nothing to plot; a zero-row grid cannot be created.
        return

    nrows = (nplots + ncols - 1) // ncols  # ceiling division

    # squeeze=False always yields a 2-D array of axes, so flattening is safe
    # for any grid shape (including a single predictor with ncols > 1).
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(figsize_per_plot[0]*ncols, figsize_per_plot[1]*nrows),
        sharex=True,
        squeeze=False,
    )
    axes = axes.flatten()

    for ax1, pred in zip(axes, predictors):
        ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

        # Target on the primary y-axis
        ax1.plot(df[date_col], df[target_col], color='blue', label=target_col)
        ax1.set_ylabel(target_col, color='blue')
        ax1.tick_params(axis='y', colors='blue')

        # Predictor on the secondary y-axis
        ax2.plot(df[date_col], df[pred], color='red', label=pred)
        ax2.set_ylabel(pred, color='red')
        ax2.tick_params(axis='y', colors='red')

        ax1.set_title(f"{target_col} vs {pred}")
        ax1.grid(True)

        # Merge the legend entries of both axes into one box
        lines_1, labels_1 = ax1.get_legend_handles_labels()
        lines_2, labels_2 = ax2.get_legend_handles_labels()
        ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper left')

    # Hide grid cells that have no predictor assigned
    for unused_ax in axes[nplots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [0]:
# Split the target and every candidate predictor into their STL components:
# tdf = trend, sdf = seasonal, rdf = residual.
date_col = 'date'
target_col = 'household_deposits'
predictor_cols = [name for name in df.columns if name not in ('date', 'household_deposits')]
tdf, sdf, rdf = decompose_stl(
    df,
    date_col,
    target_col,
    predictor_cols,
    period=12,
)

In [0]:
date_col = 'date'
target_col = 'household_deposits'
predictor_cols = [name for name in df.columns if name not in ('date', 'household_deposits')]

# Granger causality on each STL component (trend, seasonal, residual)
trend_granger, seasonal_granger, residual_granger = (
    granger_causality_tests(component, target_col, predictor_cols)
    for component in (tdf, sdf, rdf)
)

# Lagged correlations on each STL component (trend, seasonal, residual)
trend_lagcorr, seasonal_lagcorr, residual_lagcorr = (
    lagged_correlations(component, target_col, predictor_cols)
    for component in (tdf, sdf, rdf)
)

# Heatmaps of the strongest lagged correlation per predictor
# (trend and seasonal components only)
for component_name, lagcorr in (('trend', trend_lagcorr), ('seasonal', seasonal_lagcorr)):
    plot_target_correlation(lagcorr.set_index('predictor'), 'max_abs_corr', component_name)

In [0]:
def prepare_component(lagcorr_df, granger_df, component):
    """Join lag-correlation and Granger results and prefix them by component.

    Parameters
    ----------
    lagcorr_df : pandas.DataFrame
        Lagged-correlation results; must contain a ``predictor`` column.
    granger_df : pandas.DataFrame
        Granger test results; must contain a ``predictor`` column.
    component : str
        Prefix (e.g. ``'trend'``) prepended to every non-key column.

    Returns
    -------
    pandas.DataFrame
        Outer join of the two inputs on ``predictor``. Column names shared by
        both inputs receive the ``_corr`` / ``_granger`` suffix first, then
        every column except ``predictor`` is renamed to
        ``f"{component}_{col}"``.
    """
    merged = pd.merge(
        lagcorr_df, granger_df,
        on='predictor', how='outer',
        suffixes=('_corr', '_granger')
    )
    # Rename by column name rather than by position, so the key column is
    # preserved wherever it sits in the merged frame.
    return merged.rename(
        columns=lambda col: col if col == 'predictor' else f"{component}_{col}"
    )

# Build one prefixed result table per STL component
trend_df, seasonal_df, residual_df = (
    prepare_component(lagcorr, granger, name)
    for lagcorr, granger, name in (
        (trend_lagcorr, trend_granger, 'trend'),
        (seasonal_lagcorr, seasonal_granger, 'seasonal'),
        (residual_lagcorr, residual_granger, 'residual'),
    )
)

# Outer-join the three component tables on 'predictor', left to right
pivoted_result = trend_df
for component_df in (seasonal_df, residual_df):
    pivoted_result = pd.merge(pivoted_result, component_df, on='predictor', how='outer')
pivoted_result = pivoted_result.sort_values('predictor').reset_index(drop=True)

# Keep the trend/seasonal summary columns, rank by the trend Granger p-value
# and drop predictors that are missing any of these values
summary_cols = [
    'predictor',
    'trend_max_abs_corr',
    'trend_min_pvalue',
    'trend_max_corr_lag',
    'trend_min_pvalue_lag',
    'seasonal_max_abs_corr',
    'seasonal_min_pvalue',
    'seasonal_max_corr_lag',
    'seasonal_min_pvalue_lag',
]
top_predictors = (
    pivoted_result[summary_cols]
    .sort_values(by='trend_min_pvalue', ascending=True)
    .dropna()
)

# Feature reduction: keep predictors whose trend correlation with the target
# exceeds 0.9 and whose seasonal Granger p-value is significant (<= 0.05)
has_strong_trend_corr = top_predictors['trend_max_abs_corr'] > 0.9
has_seasonal_granger_signal = top_predictors['seasonal_min_pvalue'] <= 0.05
signifnicant_corr_granger = top_predictors[has_strong_trend_corr & has_seasonal_granger_signal]
signifnicant_corr_granger.reset_index(drop=True)

The criteria above — a correlation with the target greater than 0.9 together with a significant p-value (≤ 0.05) from the Granger test — eliminate almost half of the features, leaving only the 26 plotted below.

In [0]:
# Plot the trend component of the target against the trend of every
# predictor that passed the correlation / Granger filter
plot_multi_plots(
    tdf.reset_index(),
    'date',
    target_col,
    signifnicant_corr_granger['predictor'],
    ncols=3,
    figsize_per_plot=(6, 3),
)

## Manual Filtering of Irregular Features

Through visual analysis, the features `card_apparel`, `card_services`, `card_hospitality`, `production_based_gdp`, `card_durables`, `total_card_transactions`, `card_motor_vehicles_excl_fuel`, `term_deposit_rate_google`, `card_non_retail_excl_services` and `hpi` were additionally removed because of their irregular behaviour during the COVID-19 period (around 2020–21).

In [0]:
# Predictors dropped after visual inspection of the plots above
excluded_cols = [
    'card_apparel',
    'card_services',
    'card_hospitality',
    'production_based_gdp',
    'card_durables',
    'total_card_transactions',
    'card_motor_vehicles_excl_fuel',
    'term_deposit_rate_google',
    'card_non_retail_excl_services',
    'hpi',
]
is_excluded = signifnicant_corr_granger['predictor'].isin(excluded_cols)
signifnicant_corr_granger = signifnicant_corr_granger[~is_excluded]
signifnicant_corr_granger


The remaining 16 significant predictors are listed below:

In [0]:
# Names of the predictors that survived both filtering steps
signifnicant_corr_granger['predictor'].tolist()

## Further Filtering Based on Availability of Future Forecasts
Because quarterly series are interpolated, a future (forecasted) value is required to make predictions for future horizons. An additional round of feature reduction was therefore conducted on the 16 predictors listed above, retaining only those features for which forecasted values were available. See the datalog in the data folder for further details. The predictors remaining after this filtering are listed below:

`card_consumables`, `card_credit`, `household_loans`, `google_home_loan_rate_search`, `unemployment_rate`, `employed_labour_force`, `labour_force_participation_rate`, `labour_cost_index`, `cpi`, `cpi_housing_household`, `production_based_gdp`


