In [0]:
# Install the packages listed in requirements.txt; the %pip magic (rather than
# !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import plot
from plotly.subplots import make_subplots
from statsmodels.tsa.seasonal import STL
import plotly.graph_objects as go
from functools import reduce 
import glob

In [0]:
# Two inputs: the interpolated combined dataset, and the archived combined
# dataset from which only the date and the three *_balances columns are kept.
filepath = 'data/processed/interpolated/combined_df.csv'
type_filepath = 'data/archived/processed/combined_df.csv'

df_total = pd.read_csv(filepath)
df_type = pd.read_csv(type_filepath).loc[
    :, ['date', 'transaction_balances', 'savings_balances', 'term_deposit_balances']
]

# Left-join the interpolated columns onto the balance rows by date, then keep
# only rows without any missing value.
df = df_type.merge(df_total, on='date', how='left').dropna()
df

## Correlation 

In [0]:
def decompose_stl(df, date_col, target_col, predictor_cols, period=12):
    """Split the target and every predictor into STL trend/seasonal/residual parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``, ``target_col`` and all ``predictor_cols``.
        The caller's frame is left untouched; the work happens on a copy.
    date_col : str
        Column holding the observation dates (anything ``pd.to_datetime`` parses).
    target_col : str
        Name of the target series.
    predictor_cols : iterable of str
        Names of the predictor series (list, tuple, Index, ...).
    period : int, default 12
        Seasonal period forwarded to ``STL``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(trend_df, seasonal_df, residual_df)``, each indexed by the parsed,
        chronologically sorted dates, with one column per decomposed series.
    """
    # Parse the dates BEFORE sorting: sorting the raw column orders strings
    # lexicographically, which is only chronological for ISO-like formats.
    indexed = df.copy()
    indexed[date_col] = pd.to_datetime(indexed[date_col])
    indexed = indexed.sort_values(date_col).set_index(date_col)

    # Unpacking accepts any iterable of names, not just a list.
    all_cols = [target_col, *predictor_cols]

    trend_data = {}
    seasonal_data = {}
    residual_data = {}

    for col in all_cols:
        # Values that cannot be read as numbers become NaN (errors='coerce').
        series = pd.to_numeric(indexed[col], errors='coerce')
        stl = STL(series, period=period).fit()

        trend_data[col] = stl.trend
        seasonal_data[col] = stl.seasonal
        residual_data[col] = stl.resid

    # One DataFrame per STL component, all sharing the date index.
    trend_df = pd.DataFrame(trend_data, index=indexed.index)
    seasonal_df = pd.DataFrame(seasonal_data, index=indexed.index)
    residual_df = pd.DataFrame(residual_data, index=indexed.index)

    return trend_df, seasonal_df, residual_df

def compute_df_correlation(df, date_col = 'date'):
    """Return the pairwise correlation matrix of ``df``, ignoring the date column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with each other.
    date_col : str, default 'date'
        Column to leave out of the correlation. If it is not among the columns
        (for example because it is the index) nothing is dropped.

    Returns
    -------
    pandas.DataFrame
        Result of ``DataFrame.corr()`` on the remaining columns.
    """
    # The date column is not a variable to correlate, and a string-typed date
    # column would make ``corr()`` fail on recent pandas versions.
    values = df.drop(columns=[date_col], errors='ignore')
    return values.corr()

def plot_target_correlation(df, target_col, type = 'trend'):
    """Draw a one-column heatmap of absolute correlations, strongest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels the heatmap rows and whose ``target_col``
        holds the correlation values.
    target_col : str
        Column containing the correlations to display.
    type : str, default 'trend'
        Component label, used in the helper column name and the plot title.
    """
    abs_col = f'abs_{type}_correlation'

    # Absolute value next to the original column, ordered from strongest down.
    ranked = (
        df[[target_col]]
        .assign(**{abs_col: df[target_col].abs()})
        .sort_values(abs_col, ascending=False)
    )

    # Tall figure so every row label stays readable; colour scale fixed to [0, 1].
    plt.figure(figsize=(10, 15))
    sns.heatmap(ranked[[abs_col]], annot=True, cmap='coolwarm', vmin=0, vmax=1)
    plt.title(f'Correlation with Household Deposit ({type})')

import matplotlib.pyplot as plt

def plot_multi_plots(df, date_col, target_col, predictor_cols, lags=None, ncols=3, figsize_per_plot=(6, 3)):
    """Plot the target against each predictor on a grid of twin-axis charts.

    Each panel shows the target (blue, left axis) and one predictor (red,
    right axis). When a lag is given for a predictor, the lagged predictor is
    added as a dotted red line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``, ``target_col`` and the predictors.
    date_col : str
        Column used for the x-axis.
    target_col : str
        Column plotted on the primary y-axis of every panel.
    predictor_cols : sequence of str
        One panel is drawn per entry (list, Series, Index, ...).
    lags : sequence or None, default None
        One lag per predictor. ``None``/NaN entries mean "no lagged line";
        integer-valued floats are converted with ``int()``.
    ncols : int, default 3
        Number of panels per row.
    figsize_per_plot : tuple, default (6, 3)
        (width, height) of a single panel.

    Raises
    ------
    ValueError
        If ``lags`` is given and its length differs from ``predictor_cols``.
    """
    predictor_cols = list(predictor_cols)
    nplots = len(predictor_cols)
    if nplots == 0:
        # Nothing to draw; a zero-row grid cannot be created by plt.subplots.
        return

    # Validate before any figure is created so a bad call leaves no empty figure.
    if lags is None:
        lags = [None] * nplots
    else:
        lags = list(lags)
        if len(lags) != nplots:
            raise ValueError("Length of 'lags' must match length of 'predictor_cols'")

    nrows = (nplots + ncols - 1) // ncols

    # squeeze=False always returns a 2-D array of Axes, so flattening works for
    # every grid shape (including a single predictor with ncols > 1).
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(figsize_per_plot[0]*ncols, figsize_per_plot[1]*nrows),
        sharex=True,
        squeeze=False,
    )
    axes = axes.flatten()

    for ax1, pred, lag in zip(axes, predictor_cols, lags):
        # Lags read from a DataFrame column may arrive as NaN or as floats.
        if lag is not None:
            lag = None if pd.isna(lag) else int(lag)

        ax2 = ax1.twinx()  # create a second y-axis

        # Plot target on primary y-axis
        ax1.plot(df[date_col], df[target_col], color='blue', label=target_col)
        ax1.set_ylabel(target_col, color='blue')
        ax1.tick_params(axis='y', colors='blue')

        # Plot original predictor on secondary y-axis
        ax2.plot(df[date_col], df[pred], color='red', label=pred)

        # If lag is provided, plot lagged version as dotted line
        if lag is not None:
            lagged = df[pred].shift(lag)
            ax2.plot(df[date_col], lagged, color='red', linestyle='dotted', label=f"{pred} (lag {lag})")

        ax2.set_ylabel(pred, color='red')
        ax2.tick_params(axis='y', colors='red')

        ax1.set_title(f"{target_col} vs {pred}" + (f" (lag {lag})" if lag else ""))
        ax1.grid(True)

        # Combine legends from both axes
        lines_1, labels_1 = ax1.get_legend_handles_labels()
        lines_2, labels_2 = ax2.get_legend_handles_labels()
        ax1.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper left')

    # Hide any unused subplots in the last row
    for unused_ax in axes[nplots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


## Granger Causality Test
from statsmodels.tsa.stattools import grangercausalitytests

def granger_causality_tests(df, target_col, predictor_cols, maxlag=12, alpha=0.05):
    """Run Granger causality tests of each predictor against the target.

    For every predictor, ``grangercausalitytests`` is run on the two-column
    frame ``[target_col, predictor]`` for lags 1..``maxlag`` and the p-value of
    the ``ssr_ftest`` is collected per lag. Predictors without any lag at or
    below ``alpha`` are left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target and predictor series.
    target_col : str
        Name of the target column (passed as the first column to the test).
    predictor_cols : iterable of str
        Names of the predictor columns to test.
    maxlag : int, default 12
        Highest lag forwarded to ``grangercausalitytests``.
    alpha : float, default 0.05
        Significance threshold; a lag is significant when its p-value <= alpha.

    Returns
    -------
    pandas.DataFrame
        Columns ``predictor``, ``min_pvalue_lag``, ``min_pvalue`` and
        ``significant_lags``, sorted by ``min_pvalue`` ascending. The frame is
        empty (but still has these columns) when nothing is significant.
    """
    columns = ['predictor', 'min_pvalue_lag', 'min_pvalue', 'significant_lags']
    rows = []
    for pred in predictor_cols:
        test_result = grangercausalitytests(df[[target_col, pred]], maxlag=maxlag, verbose=False)
        # p-value of the SSR-based F test for every tested lag
        lag_pvals = {lag: test_result[lag][0]['ssr_ftest'][1] for lag in test_result}

        significant_lags = [lag for lag, pval in lag_pvals.items() if pval <= alpha]
        if not significant_lags:
            continue

        # Among the significant lags, report the one with the smallest p-value
        min_lag = min(significant_lags, key=lag_pvals.get)
        rows.append({
            'predictor': pred,
            'min_pvalue_lag': min_lag,
            'min_pvalue': lag_pvals[min_lag],
            'significant_lags': significant_lags
        })

    # Explicit columns keep the sort valid when no predictor was significant.
    results_df = pd.DataFrame(rows, columns=columns)
    return results_df.sort_values('min_pvalue').reset_index(drop=True)

## Correlations With Lags
def lagged_correlations(df, target_col, predictor_cols, maxlag=12):
    """Find, for each predictor, the lag with the strongest correlation to the target.

    The target is correlated with each predictor shifted by 0..``maxlag``
    rows; the lag with the largest absolute correlation is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target and predictor series.
    target_col : str
        Name of the target column.
    predictor_cols : iterable of str
        Names of the predictor columns.
    maxlag : int, default 12
        Largest shift (in rows) applied to each predictor.

    Returns
    -------
    pandas.DataFrame
        One row per predictor with columns ``predictor``, ``max_corr_lag``,
        ``max_abs_corr``, ``max_raw_corr`` and ``significant_lags``, sorted by
        ``max_abs_corr`` descending (rows without a usable correlation last).
        Despite its name, ``significant_lags`` lists every lag whose
        correlation could be computed (is not NaN); no significance test is run.
    """
    columns = ['predictor', 'max_corr_lag', 'max_abs_corr', 'max_raw_corr', 'significant_lags']
    # NaN (not None) marks "no usable correlation" so the numeric columns stay
    # numeric and sortable even when every predictor is unusable.
    missing = float('nan')

    results = []
    target = df[target_col]

    for pred in predictor_cols:
        predictor = df[pred]
        lag_corrs = {lag: target.corr(predictor.shift(lag)) for lag in range(maxlag + 1)}
        valid_lags = [lag for lag, corr in lag_corrs.items() if pd.notna(corr)]

        if valid_lags:
            best_lag = max(valid_lags, key=lambda lag: abs(lag_corrs[lag]))
            results.append({
                'predictor': pred,
                'max_corr_lag': best_lag,
                'max_abs_corr': abs(lag_corrs[best_lag]),
                'max_raw_corr': lag_corrs[best_lag],
                'significant_lags': valid_lags
            })
        else:
            results.append({
                'predictor': pred,
                'max_corr_lag': missing,
                'max_abs_corr': missing,
                'max_raw_corr': missing,
                'significant_lags': []
            })

    # Explicit columns keep the sort valid for an empty predictor list.
    summary = pd.DataFrame(results, columns=columns)
    return summary.sort_values(by='max_abs_corr', ascending=False).reset_index(drop=True)

def prepare_component(lagcorr_df, granger_df, component):
    """Join the lag-correlation and Granger results and tag them with a component.

    Parameters
    ----------
    lagcorr_df : pandas.DataFrame
        Lagged-correlation results; must contain a ``predictor`` column.
    granger_df : pandas.DataFrame
        Granger test results; must contain a ``predictor`` column.
    component : str
        Prefix (e.g. ``'trend'``) added to every column except ``predictor``.

    Returns
    -------
    pandas.DataFrame
        Outer join of both inputs on ``predictor``. Column names shared by the
        inputs get the ``_corr`` / ``_granger`` suffix, then every column other
        than ``predictor`` is renamed to ``f"{component}_{column}"``.
    """
    merged = pd.merge(
        lagcorr_df, granger_df,
        on='predictor', how='outer',
        suffixes=('_corr', '_granger')
    )
    # Rename by column NAME rather than by position, so the join key is kept
    # intact wherever 'predictor' sits in the merged frame.
    return merged.rename(
        columns=lambda col: col if col == 'predictor' else f"{component}_{col}"
    )


def plot_single_series(df, date_col, target_col, predictor_col, lag=0, invert=False):
    """Plot the target and one (optionally lagged / inverted) predictor on twin axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``, ``target_col`` and ``predictor_col``.
    date_col : str
        Column used for the x-axis.
    target_col : str
        Series drawn in blue on the left y-axis.
    predictor_col : str
        Series drawn as a dotted red line on the right y-axis.
    lag : int, default 0
        Number of rows the predictor is shifted by before plotting.
    invert : bool, default False
        If True, the predictor is negated before plotting.
    """
    fig, target_ax = plt.subplots(figsize=(8, 4))
    predictor_ax = target_ax.twinx()

    # Left axis: the target series
    target_ax.plot(df[date_col], df[target_col], color='blue', label=target_col)
    target_ax.set_ylabel(target_col, color='blue')
    target_ax.tick_params(axis='y', colors='blue')

    # Build the predictor series together with a legend label describing
    # the transformations applied to it.
    legend_label = predictor_col
    if lag:
        legend_label = legend_label + f" (lag {lag})"
    shifted = df[predictor_col].shift(lag)
    if invert:
        shifted = -shifted
        legend_label = legend_label + " (inverted)"

    # Right axis: the transformed predictor
    predictor_ax.plot(df[date_col], shifted, color='red', linestyle='dotted', label=legend_label)
    predictor_ax.set_ylabel(predictor_col, color='red')
    predictor_ax.tick_params(axis='y', colors='red')

    target_ax.set_title(f"{target_col} vs {predictor_col}")
    target_ax.grid(True)

    # A single legend listing the lines of both axes, target first
    handles, labels = [], []
    for axis in (target_ax, predictor_ax):
        axis_handles, axis_labels = axis.get_legend_handles_labels()
        handles = handles + axis_handles
        labels = labels + axis_labels
    target_ax.legend(handles, labels, loc='upper left')

    plt.tight_layout()
    plt.show()



## Term Deposit Balances

In [0]:
# STL decomposition for the term-deposit target: every column of df other
# than the date and the target itself is treated as a predictor.
date_col = 'date'
target_col = 'term_deposit_balances'
predictor_cols = [name for name in df.columns if name not in {'date', target_col}]
tdf, sdf, rdf = decompose_stl(df, date_col, target_col, predictor_cols, period=12)

In [0]:
# Trend component (tdf): Granger screen of each predictor against the target,
# plus the lag with the strongest correlation per predictor.
trend_granger = granger_causality_tests(tdf, target_col, predictor_cols)
trend_lagcorr = lagged_correlations(tdf, target_col, predictor_cols)

# Seasonal component (sdf): same two tests.
seasonal_granger = granger_causality_tests(sdf, target_col, predictor_cols)
seasonal_lagcorr = lagged_correlations(sdf, target_col, predictor_cols)

# Residual component (rdf): same two tests.
residual_granger = granger_causality_tests(rdf, target_col, predictor_cols)
residual_lagcorr = lagged_correlations(rdf, target_col, predictor_cols)

In [0]:
# Heatmaps of the best absolute lagged correlation per predictor, for the
# trend and seasonal components (the residual component is not plotted).
plot_target_correlation(trend_lagcorr.set_index('predictor'), 'max_abs_corr', 'trend')
plot_target_correlation(seasonal_lagcorr.set_index('predictor'), 'max_abs_corr', 'seasonal')

In [0]:
# Join the lag-correlation and Granger results per STL component
trend_df = prepare_component(trend_lagcorr, trend_granger, 'trend')
seasonal_df = prepare_component(seasonal_lagcorr, seasonal_granger, 'seasonal')
residual_df = prepare_component(residual_lagcorr, residual_granger, 'residual')

# Outer-join the three component tables on 'predictor', one after the other
pivoted_result = trend_df
for component_df in (seasonal_df, residual_df):
    pivoted_result = pivoted_result.merge(component_df, on='predictor', how='outer')
pivoted_result = pivoted_result.sort_values('predictor').reset_index(drop=True)

# Trend and seasonal summary columns; dropna() removes predictors that lack a
# value in any of them (e.g. no significant Granger lag for that component).
summary_cols = [
    'predictor',
    'trend_max_abs_corr', 'trend_max_raw_corr', 'trend_min_pvalue',
    'trend_max_corr_lag', 'trend_min_pvalue_lag',
    'seasonal_max_abs_corr', 'seasonal_min_pvalue',
    'seasonal_max_corr_lag', 'seasonal_min_pvalue_lag',
]
top_predictors = (
    pivoted_result[summary_cols]
    .sort_values(by='trend_min_pvalue', ascending=True)
    .dropna()
)

# Keep predictors whose best absolute trend correlation is above 0.9
signifnicant_corr_granger = top_predictors.loc[top_predictors['trend_max_abs_corr'] > 0.9]
signifnicant_corr_granger

In [0]:
# Trend of the target against the trend of each selected predictor; the dotted
# line in each panel is the predictor shifted by its best-correlation lag.
plot_multi_plots(tdf.reset_index(), 'date', target_col, signifnicant_corr_granger.predictor, lags=signifnicant_corr_granger.trend_max_corr_lag, ncols=3, figsize_per_plot=(6, 3))

## Transaction Balances

In [0]:
# STL decomposition for the transaction-balance target: predictors are all
# columns of df except the date, the target and 'savings_balances'.
date_col = 'date'
target_col = 'transaction_balances'
predictor_cols = [name for name in df.columns if name not in {'date', target_col, 'savings_balances'}]
tdf, sdf, rdf = decompose_stl(df, date_col, target_col, predictor_cols, period=12)

In [0]:
# Trend component (tdf): Granger screen of each predictor against the target,
# plus the lag with the strongest correlation per predictor.
trend_granger = granger_causality_tests(tdf, target_col, predictor_cols)
trend_lagcorr = lagged_correlations(tdf, target_col, predictor_cols)

# Seasonal component (sdf): same two tests.
seasonal_granger = granger_causality_tests(sdf, target_col, predictor_cols)
seasonal_lagcorr = lagged_correlations(sdf, target_col, predictor_cols)

# Residual component (rdf): same two tests.
residual_granger = granger_causality_tests(rdf, target_col, predictor_cols)
residual_lagcorr = lagged_correlations(rdf, target_col, predictor_cols)

In [0]:
# Heatmaps of the best absolute lagged correlation per predictor, for the
# trend and seasonal components (the residual component is not plotted).
plot_target_correlation(trend_lagcorr.set_index('predictor'), 'max_abs_corr', 'trend')
plot_target_correlation(seasonal_lagcorr.set_index('predictor'), 'max_abs_corr', 'seasonal')

In [0]:
# Join the lag-correlation and Granger results per STL component
trend_df = prepare_component(trend_lagcorr, trend_granger, 'trend')
seasonal_df = prepare_component(seasonal_lagcorr, seasonal_granger, 'seasonal')
residual_df = prepare_component(residual_lagcorr, residual_granger, 'residual')

# Outer-join the three component tables on 'predictor', one after the other
pivoted_result = trend_df
for component_df in (seasonal_df, residual_df):
    pivoted_result = pivoted_result.merge(component_df, on='predictor', how='outer')
pivoted_result = pivoted_result.sort_values('predictor').reset_index(drop=True)

# Trend and seasonal summary columns; dropna() removes predictors that lack a
# value in any of them (e.g. no significant Granger lag for that component).
summary_cols = [
    'predictor',
    'trend_max_abs_corr', 'trend_max_raw_corr', 'trend_min_pvalue',
    'trend_max_corr_lag', 'trend_min_pvalue_lag',
    'seasonal_max_abs_corr', 'seasonal_min_pvalue',
    'seasonal_max_corr_lag', 'seasonal_min_pvalue_lag',
]
top_predictors = (
    pivoted_result[summary_cols]
    .sort_values(by='trend_min_pvalue', ascending=True)
    .dropna()
)

# Keep predictors whose best absolute trend correlation is at least 0.8
signifnicant_corr_granger = top_predictors.loc[top_predictors['trend_max_abs_corr'] >= 0.8]
signifnicant_corr_granger

In [0]:
# Trend of the target against the trend of each selected predictor; the dotted
# line in each panel is the predictor shifted by its best-correlation lag.
plot_multi_plots(tdf.reset_index(), 'date', target_col, signifnicant_corr_granger.predictor, lags=signifnicant_corr_granger.trend_max_corr_lag, ncols=3, figsize_per_plot=(6, 3))

In [0]:
# For predictors whose strongest trend correlation is negative, plot the
# lagged predictor with its sign flipped so it can be compared with the target.
negatively_correlated = signifnicant_corr_granger[signifnicant_corr_granger['trend_max_raw_corr'] < 0]
for predictor, best_lag in zip(negatively_correlated['predictor'], negatively_correlated['trend_max_corr_lag']):
    plot_single_series(tdf.reset_index(), 'date', target_col, predictor, lag=best_lag, invert=True)

## Savings Balances

In [0]:
# STL decomposition for the savings-balance target: predictors are all
# columns of df except the date, the target and 'transaction_balances'.
date_col = 'date'
target_col = 'savings_balances'
predictor_cols = [name for name in df.columns if name not in {'date', target_col, 'transaction_balances'}]
tdf, sdf, rdf = decompose_stl(df, date_col, target_col, predictor_cols, period=12)

In [0]:
# Trend component (tdf): Granger screen of each predictor against the target,
# plus the lag with the strongest correlation per predictor.
trend_granger = granger_causality_tests(tdf, target_col, predictor_cols)
trend_lagcorr = lagged_correlations(tdf, target_col, predictor_cols)

# Seasonal component (sdf): same two tests.
seasonal_granger = granger_causality_tests(sdf, target_col, predictor_cols)
seasonal_lagcorr = lagged_correlations(sdf, target_col, predictor_cols)

# Residual component (rdf): same two tests.
residual_granger = granger_causality_tests(rdf, target_col, predictor_cols)
residual_lagcorr = lagged_correlations(rdf, target_col, predictor_cols)

In [0]:
# Heatmaps of the best absolute lagged correlation per predictor, for the
# trend and seasonal components (the residual component is not plotted).
plot_target_correlation(trend_lagcorr.set_index('predictor'), 'max_abs_corr', 'trend')
plot_target_correlation(seasonal_lagcorr.set_index('predictor'), 'max_abs_corr', 'seasonal')

In [0]:
# Join the lag-correlation and Granger results per STL component
trend_df = prepare_component(trend_lagcorr, trend_granger, 'trend')
seasonal_df = prepare_component(seasonal_lagcorr, seasonal_granger, 'seasonal')
residual_df = prepare_component(residual_lagcorr, residual_granger, 'residual')

# Outer-join the three component tables on 'predictor', one after the other
pivoted_result = trend_df
for component_df in (seasonal_df, residual_df):
    pivoted_result = pivoted_result.merge(component_df, on='predictor', how='outer')
pivoted_result = pivoted_result.sort_values('predictor').reset_index(drop=True)

# Trend and seasonal summary columns; dropna() removes predictors that lack a
# value in any of them (e.g. no significant Granger lag for that component).
summary_cols = [
    'predictor',
    'trend_max_abs_corr', 'trend_max_raw_corr', 'trend_min_pvalue',
    'trend_max_corr_lag', 'trend_min_pvalue_lag',
    'seasonal_max_abs_corr', 'seasonal_min_pvalue',
    'seasonal_max_corr_lag', 'seasonal_min_pvalue_lag',
]
top_predictors = (
    pivoted_result[summary_cols]
    .sort_values(by='trend_min_pvalue', ascending=True)
    .dropna()
)

# Keep predictors whose best absolute trend correlation is at least 0.9
signifnicant_corr_granger = top_predictors.loc[top_predictors['trend_max_abs_corr'] >= 0.9]
signifnicant_corr_granger

In [0]:
# Trend of the target against the trend of each selected predictor; the dotted
# line in each panel is the predictor shifted by its best-correlation lag.
plot_multi_plots(tdf.reset_index(), 'date', target_col, signifnicant_corr_granger.predictor, lags=signifnicant_corr_granger.trend_max_corr_lag, ncols=3, figsize_per_plot=(6, 3))

In [0]:
# For predictors whose strongest trend correlation is negative, plot the
# lagged predictor with its sign flipped so it can be compared with the target.
negatively_correlated = signifnicant_corr_granger[signifnicant_corr_granger['trend_max_raw_corr'] < 0]
for predictor, best_lag in zip(negatively_correlated['predictor'], negatively_correlated['trend_max_corr_lag']):
    plot_single_series(tdf.reset_index(), 'date', target_col, predictor, lag=best_lag, invert=True)