In [1]:
import vectorbt as vbt
import numpy as np
import pandas as pd
import datetime
import plotly.express as px
from xbbg import blp
import os
import quantstats as qs
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging

# Import custom modules with an alias
import bloomberg_data as bd
import transformations as tr
import visuals as vis

In [2]:
# Main data retrieval and merging process
tickers = ['.MIDERCAD U Index', '.CADIG F Index', 'VIX Index', '.HYUSER U Index', '.IGUSER U Index','SPTSX INDEX','ECSURPUS Index','LEI YOY Index','ECRPUS 1Y Index','ECRPCA 1Y Index','GDGCAFJP Index','GDGCAFJP Index','.HARDATA G Index','CGERGLOB Index','.FRCRRM G Index']
fields = [['PX_LAST'], ['PX_LAST'], ['PX_LAST'], ['PX_LAST'], ['PX_LAST'], ['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST'],['PX_LAST']]
start_date = '2006-01-01'
end_date = '2025-12-31'
column_names = [['cad_ig_er_index'], ['cad_ig_sprds'], ['vix'], ['us_hy_er_index'], ['us_ig_er_index'],['tsx_index'],['us_eco_suprise'],['lei_yoy_index'],['us_recession_odds'],['cad_recession_odds'],['atlanta_fed'],['growth_surpise'],['hard_data'],['equity_revisions'],['fed_credit_model']]
frequency = 'm'  # Single frequency for all tickers

dataframes = []

for ticker, field, col_name in zip(tickers, fields, column_names):
    df = bd.get_single_ticker_data(ticker, field, start_date, end_date, freq=frequency, column_names=col_name)
    dataframes.append(df)
    logging.info(f"Data for {ticker}:")
    logging.info(df.head())  # Print the first few rows of each dataframe

# Merge all dataframes
merged_data = bd.merge_dataframes(dataframes, method='outer')

# Print the final merged data and its information
logging.info("Merged data head:")
logging.info(merged_data.head())
logging.info('----------------------------------------------------------------')
logging.info('----------------------------------------------------------------')
logging.info(merged_data.tail())
logging.info(merged_data.info())

# Rename the index to "Date" and reset it
merged_data.index.name = 'Date'
csv_data = merged_data.reset_index()

# Save the dataframe to a CSV file
csv_data.to_csv('Outputs/csv_data.csv', index=False)

# Rename for further use
data = merged_data

2024-09-22 13:05:46,843 - INFO - Retrieving data for ticker: .MIDERCAD U Index with frequency: MONTHLY
2024-09-22 13:05:47,755 - INFO - Retrieved data shape for .MIDERCAD U Index: (225, 1)
2024-09-22 13:05:47,756 - INFO - Cleaned data shape for .MIDERCAD U Index: (225, 1)
2024-09-22 13:05:47,759 - INFO - Successfully retrieved data for ticker: .MIDERCAD U Index
2024-09-22 13:05:47,759 - INFO - Data for .MIDERCAD U Index:
2024-09-22 13:05:47,759 - INFO -             cad_ig_er_index
2006-01-31           1.0579
2006-02-28           1.0576
2006-03-31           1.0569
2006-04-28           1.0561
2006-05-31           1.0565
2024-09-22 13:05:47,762 - INFO - Retrieving data for ticker: .CADIG F Index with frequency: MONTHLY
2024-09-22 13:05:48,252 - INFO - Retrieved data shape for .CADIG F Index: (225, 1)
2024-09-22 13:05:48,253 - INFO - Cleaned data shape for .CADIG F Index: (225, 1)
2024-09-22 13:05:48,253 - INFO - Successfully retrieved data for ticker: .CADIG F Index
2024-09-22 13:05:48,25

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 289 entries, 2006-01-31 to 2024-09-30
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   cad_ig_er_index     289 non-null    float64
 1   cad_ig_sprds        289 non-null    float64
 2   vix                 289 non-null    float64
 3   us_hy_er_index      289 non-null    float64
 4   us_ig_er_index      289 non-null    float64
 5   tsx_index           289 non-null    float64
 6   us_eco_suprise      289 non-null    float64
 7   lei_yoy_index       289 non-null    float64
 8   us_recession_odds   289 non-null    float64
 9   cad_recession_odds  289 non-null    float64
 10  atlanta_fed         289 non-null    float64
 11  growth_surpise      289 non-null    float64
 12  hard_data           289 non-null    float64
 13  equity_revisions    289 non-null    float64
 14  fed_credit_model    289 non-null    float64
dtypes: float64(15)
memory usage: 36.1 KB


In [3]:
data.tail()

Unnamed: 0_level_0,cad_ig_er_index,cad_ig_sprds,vix,us_hy_er_index,us_ig_er_index,tsx_index,us_eco_suprise,lei_yoy_index,us_recession_odds,cad_recession_odds,atlanta_fed,growth_surpise,hard_data,equity_revisions,fed_credit_model
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2024-06-30,1.3925,120.2679,12.44,1.127,1.4199,21875.79,-0.477,-4.9,30.0,25.0,2.248,2.248,-0.2787,-0.07,0.2
2024-07-31,1.3932,120.0034,16.36,1.1243,1.4226,23110.81,-0.464,-5.2,30.0,25.0,2.83,2.83,-0.3013,-0.09,0.18
2024-08-30,1.3905,122.7558,15.0,1.1282,1.4267,23346.18,-0.418,-5.2,30.0,25.0,2.531,2.531,-0.2806,-0.15,0.18
2024-08-31,1.3905,122.7558,15.0,1.1282,1.4267,23346.18,-0.418,-5.0,30.0,25.0,2.531,2.531,-0.2806,-0.15,0.18
2024-09-30,1.3959,118.4697,16.15,1.1282,1.4338,23867.37,-0.251,-5.0,30.0,25.0,2.93,2.93,-0.1544,-0.31,0.18


In [4]:
data.head()

Unnamed: 0_level_0,cad_ig_er_index,cad_ig_sprds,vix,us_hy_er_index,us_ig_er_index,tsx_index,us_eco_suprise,lei_yoy_index,us_recession_odds,cad_recession_odds,atlanta_fed,growth_surpise,hard_data,equity_revisions,fed_credit_model
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2006-01-31,1.0579,58.7572,12.95,0.5701,1.0877,11945.64,0.048,3.6,70.0,10.0,1.6,1.6,-0.0805,0.15,0.11
2006-02-28,1.0576,59.464,12.34,0.5731,1.091,11688.34,0.194,3.1,70.0,10.0,1.6,1.6,0.1658,0.13,0.13
2006-03-31,1.0569,59.6169,11.39,0.5828,1.0874,12110.61,0.221,3.7,70.0,10.0,1.6,1.6,0.0398,0.19,0.11
2006-04-28,1.0561,58.7984,11.59,0.5888,1.0881,12204.17,0.159,3.7,70.0,10.0,1.6,1.6,0.0808,0.27,0.11
2006-04-30,1.0561,58.7984,11.59,0.5888,1.0881,12204.17,0.159,2.6,70.0,10.0,1.6,1.6,0.0808,0.27,0.1


In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Assuming 'data' is your DataFrame containing time series data
# Example of what 'data' might look like (replace with your actual DataFrame)
# data = pd.DataFrame({...})  # Your actual data

# Create subplots
fig = make_subplots(rows=len(data.columns), cols=1, shared_xaxes=True, 
                    vertical_spacing=0.02, subplot_titles=data.columns)

# Add a line chart for each column
for i, col in enumerate(data.columns):
    fig.add_trace(
        go.Scatter(x=data.index, y=data[col], mode='lines', name=col),
        row=i+1, col=1
    )

# Update layout to improve appearance
fig.update_layout(height=300 * len(data.columns), width=900, 
                  title_text="Time Series Line Charts for All Columns in DataFrame",
                  showlegend=False)

# Show the figure
fig.show()



In [6]:
data

Unnamed: 0_level_0,cad_ig_er_index,cad_ig_sprds,vix,us_hy_er_index,us_ig_er_index,tsx_index,us_eco_suprise,lei_yoy_index,us_recession_odds,cad_recession_odds,atlanta_fed,growth_surpise,hard_data,equity_revisions,fed_credit_model
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2006-01-31,1.0579,58.7572,12.95,0.5701,1.0877,11945.64,0.048,3.6,70.0,10.0,1.600,1.600,-0.0805,0.15,0.11
2006-02-28,1.0576,59.4640,12.34,0.5731,1.0910,11688.34,0.194,3.1,70.0,10.0,1.600,1.600,0.1658,0.13,0.13
2006-03-31,1.0569,59.6169,11.39,0.5828,1.0874,12110.61,0.221,3.7,70.0,10.0,1.600,1.600,0.0398,0.19,0.11
2006-04-28,1.0561,58.7984,11.59,0.5888,1.0881,12204.17,0.159,3.7,70.0,10.0,1.600,1.600,0.0808,0.27,0.11
2006-04-30,1.0561,58.7984,11.59,0.5888,1.0881,12204.17,0.159,2.6,70.0,10.0,1.600,1.600,0.0808,0.27,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-30,1.3925,120.2679,12.44,1.1270,1.4199,21875.79,-0.477,-4.9,30.0,25.0,2.248,2.248,-0.2787,-0.07,0.20
2024-07-31,1.3932,120.0034,16.36,1.1243,1.4226,23110.81,-0.464,-5.2,30.0,25.0,2.830,2.830,-0.3013,-0.09,0.18
2024-08-30,1.3905,122.7558,15.00,1.1282,1.4267,23346.18,-0.418,-5.2,30.0,25.0,2.531,2.531,-0.2806,-0.15,0.18
2024-08-31,1.3905,122.7558,15.00,1.1282,1.4267,23346.18,-0.418,-5.0,30.0,25.0,2.531,2.531,-0.2806,-0.15,0.18


In [8]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import StandardScaler

data_transform = data.copy()

# Function to check stationarity and apply transformations
def make_stationary(df, lags=[1, 3, 6, 12]):
    for column in df.columns:
        result = adfuller(df[column].dropna())
        p_value = result[1]
        if p_value > 0.05:  # Non-stationary
            for lag in lags:
                df[f'{column}_pct_change_{lag}'] = df[column].pct_change(lag)
            df.drop(columns=[column], inplace=True)
    return df

# Apply the function to make data stationary
data_transform = make_stationary(data_transform)

# Standardize the data
scaler = StandardScaler()
data_transform = pd.DataFrame(scaler.fit_transform(data_transform.dropna()), 
                              columns=data_transform.columns, 
                              index=data_transform.dropna().index)

# Display the transformed data
data_transform.head()

Unnamed: 0_level_0,cad_ig_sprds,vix,us_eco_suprise,cad_recession_odds,atlanta_fed,growth_surpise,hard_data,equity_revisions,fed_credit_model,cad_ig_er_index_pct_change_1,...,tsx_index_pct_change_6,tsx_index_pct_change_12,lei_yoy_index_pct_change_1,lei_yoy_index_pct_change_3,lei_yoy_index_pct_change_6,lei_yoy_index_pct_change_12,us_recession_odds_pct_change_1,us_recession_odds_pct_change_3,us_recession_odds_pct_change_6,us_recession_odds_pct_change_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-11-30,-1.548541,-1.045809,-1.691909,-0.632883,-0.077401,-0.077401,-1.938334,0.866273,-0.625901,0.034871,...,0.864592,0.244983,4.508167,2.900222,-0.556866,-0.312732,-0.055387,-0.106243,-0.154987,-0.200569
2006-12-29,-1.567531,-0.970616,-1.840361,-0.632883,-0.077401,-0.077401,-1.601441,0.663563,-0.625901,-0.091032,...,0.787682,0.533349,-0.018227,-3.73175,-0.67884,-0.321542,-0.055387,-0.106243,-0.154987,-0.200569
2006-12-31,-1.567531,-0.970616,-1.840361,-0.632883,-0.077401,-0.077401,-1.601441,0.663563,-0.772374,-0.100012,...,0.548535,0.231993,-0.161922,1.058008,1.893452,-0.305351,-0.055387,-0.106243,-0.154987,-0.200569
2007-01-31,-1.602397,-1.102493,-1.211466,-0.632883,-0.077401,-0.077401,-0.449247,0.511531,-0.821198,0.070583,...,0.974672,0.24867,1.436685,0.280184,4.430507,-0.358495,-0.055387,-0.106243,-0.154987,-0.200569
2007-02-28,-1.647632,-0.524086,-1.187174,-0.632883,-0.077401,-0.077401,-0.46564,0.815596,-0.821198,0.141979,...,0.984773,0.25566,-0.018227,0.280184,-5.153923,-0.400966,-0.055387,-0.106243,-0.154987,-0.200569


In [10]:
data_transform.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 277 entries, 2006-11-30 to 2024-09-30
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   cad_ig_sprds                     277 non-null    float64
 1   vix                              277 non-null    float64
 2   us_eco_suprise                   277 non-null    float64
 3   cad_recession_odds               277 non-null    float64
 4   atlanta_fed                      277 non-null    float64
 5   growth_surpise                   277 non-null    float64
 6   hard_data                        277 non-null    float64
 7   equity_revisions                 277 non-null    float64
 8   fed_credit_model                 277 non-null    float64
 9   cad_ig_er_index_pct_change_1     277 non-null    float64
 10  cad_ig_er_index_pct_change_3     277 non-null    float64
 11  cad_ig_er_index_pct_change_6     277 non-null    float64
 12  cad

In [12]:
# Generate lagged features for lags between 1 to 12
lags = range(1, 13)
lagged_features = pd.DataFrame(index=data_transform.index)

for column in data_transform.columns:
    for lag in lags:
        lagged_features[f'{column}_lag_{lag}'] = data_transform[column].shift(lag)

# Combine the lagged features with the target variables
targets = data_transform[['cad_ig_er_index_pct_change_1', 'cad_ig_er_index_pct_change_3', 'cad_ig_er_index_pct_change_6', 'cad_ig_er_index_pct_change_12']]
combined_data = pd.concat([lagged_features, targets], axis=1)

# Drop rows with NaN values resulting from the lagging process
combined_data.dropna(inplace=True)

# Calculate the correlation between each lagged feature and the target variables
correlation_matrix = combined_data.corr()

# Extract the correlations for the target variables
target_columns = ['cad_ig_er_index_pct_change_1', 'cad_ig_er_index_pct_change_3', 'cad_ig_er_index_pct_change_6', 'cad_ig_er_index_pct_change_12']
correlations = correlation_matrix[target_columns].drop(target_columns)

correlations

Unnamed: 0,cad_ig_er_index_pct_change_1,cad_ig_er_index_pct_change_3,cad_ig_er_index_pct_change_6,cad_ig_er_index_pct_change_12
cad_ig_sprds_lag_1,0.125650,-0.112430,-0.308362,-0.531616
cad_ig_sprds_lag_2,0.156307,0.067120,-0.162406,-0.444543
cad_ig_sprds_lag_3,0.196432,0.258533,-0.008165,-0.343179
cad_ig_sprds_lag_4,0.262409,0.329038,0.148543,-0.227163
cad_ig_sprds_lag_5,0.296619,0.401623,0.302024,-0.105901
...,...,...,...,...
us_recession_odds_pct_change_12_lag_8,0.063047,0.113084,0.124390,-0.002406
us_recession_odds_pct_change_12_lag_9,0.060679,0.107615,0.126889,0.047760
us_recession_odds_pct_change_12_lag_10,0.058418,0.094451,0.125075,0.090889
us_recession_odds_pct_change_12_lag_11,0.073259,0.099141,0.128933,0.133909


In [13]:
from statsmodels.tsa.stattools import grangercausalitytests

# Function to perform Granger Causality Test
def granger_causality_tests(data, target, max_lag=12):
    results = {}
    for column in data.columns:
        if column != target:
            test_result = grangercausalitytests(data[[target, column]], max_lag, verbose=False)
            p_values = [round(test_result[i+1][0]['ssr_ftest'][1], 4) for i in range(max_lag)]
            results[column] = p_values
    return results

# Perform Granger Causality Test for each target variable
max_lag = 12
granger_results = {}
for target in target_columns:
    granger_results[target] = granger_causality_tests(combined_data, target, max_lag)

granger_results

{'cad_ig_er_index_pct_change_1': {'cad_ig_sprds_lag_1': [0.0212,
   0.0249,
   0.0058,
   0.0004,
   0.004,
   0.0075,
   0.0089,
   0.0025,
   0.0037,
   0.0094,
   0.0276,
   0.0135],
  'cad_ig_sprds_lag_2': [0.0034,
   0.0,
   0.001,
   0.0004,
   0.0036,
   0.0051,
   0.001,
   0.0022,
   0.003,
   0.0061,
   0.0208,
   0.0124],
  'cad_ig_sprds_lag_3': [0.0001,
   0.0001,
   0.001,
   0.0004,
   0.0025,
   0.0008,
   0.0008,
   0.0019,
   0.0022,
   0.0036,
   0.0208,
   0.012],
  'cad_ig_sprds_lag_4': [0.0,
   0.0001,
   0.0011,
   0.0021,
   0.0043,
   0.0071,
   0.0055,
   0.0091,
   0.0062,
   0.0143,
   0.0634,
   0.0468],
  'cad_ig_sprds_lag_5': [0.0,
   0.0001,
   0.003,
   0.0032,
   0.0036,
   0.0038,
   0.0037,
   0.0048,
   0.0063,
   0.0148,
   0.0675,
   0.0389],
  'cad_ig_sprds_lag_6': [0.0,
   0.0001,
   0.0017,
   0.0041,
   0.0022,
   0.0022,
   0.0016,
   0.0048,
   0.0063,
   0.0148,
   0.0409,
   0.0393],
  'cad_ig_sprds_lag_7': [0.0,
   0.0001,
   0.0023,
   0.

# Interpretation of Granger Causality Test Results

The Granger Causality Test helps us determine whether one time series can predict another. In this context, we are testing whether lagged features can predict the target variables (`cad_ig_er_index_pct_change_1`, `cad_ig_er_index_pct_change_3`, `cad_ig_er_index_pct_change_6`, and `cad_ig_er_index_pct_change_12`).

## Key Points to Understand

- **Null Hypothesis:** The lagged feature does not Granger-cause the target variable.
- **Alternative Hypothesis:** The lagged feature Granger-causes the target variable.
- **P-Value:** The probability of observing the test results under the null hypothesis. Lower p-values indicate stronger evidence against the null hypothesis.

### Interpretation of P-Values

- **P-Value < 0.05:** Strong evidence against the null hypothesis, suggesting that the lagged feature Granger-causes the target variable.
- **P-Value >= 0.05:** Weak evidence against the null hypothesis, suggesting that the lagged feature does not Granger-cause the target variable.

## Summary of Results

Let's summarize the results for each target variable.

### `cad_ig_er_index_pct_change_1`

The following lagged features have p-values less than 0.05 for multiple lags, indicating strong evidence that they Granger-cause `cad_ig_er_index_pct_change_1`:

- `cad_ig_sprds_lag_1` to `cad_ig_sprds_lag_6`

### `cad_ig_er_index_pct_change_3`

The following lagged features have p-values less than 0.05 for multiple lags, indicating strong evidence that they Granger-cause `cad_ig_er_index_pct_change_3`:

- `cad_ig_sprds_lag_1` to `cad_ig_sprds_lag_6`

### `cad_ig_er_index_pct_change_6`

The following lagged features have p-values less than 0.05 for multiple lags, indicating strong evidence that they Granger-cause `cad_ig_er_index_pct_change_6`:

- `cad_ig_sprds_lag_1` to `cad_ig_sprds_lag_6`

### `cad_ig_er_index_pct_change_12`

The following lagged features have p-values less than 0.05 for multiple lags, indicating strong evidence that they Granger-cause `cad_ig_er_index_pct_change_12`:

- `cad_ig_sprds_lag_1` to `cad_ig_sprds_lag_6`

## Conclusion

The Granger Causality Test results suggest that the lagged features from `cad_ig_sprds_lag_1` to `cad_ig_sprds_lag_6` have strong predictive power for all four target variables. These features can be used in predictive models to forecast the target variables.


