In [1]:
import vectorbt as vbt
import numpy as np
import pandas as pd
import datetime
import plotly.express as px
from xbbg import blp
import os
import quantstats as qs
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Import custom modules with an alias
import bloomberg_data as bd
import transformations as tr


ModuleNotFoundError: No module named 'bloomberg_data'

In [2]:
# Pull each Bloomberg series through the project's `bloomberg_data` helpers
# (imported as `bd`) and merge them into one frame.
tickers = ['.MIDERCAD U Index', '.CADIG F Index', 'VIX Index','.HYUSER U Index','.IGUSER U Index']
fields = [['PX_LAST'], ['PX_LAST'], ['PX_LAST'],['PX_LAST'], ['PX_LAST']]
start_date = '2000-01-01'
end_date = '2025-12-31'
column_names = [['cad_ig_er_index'], ['cad_ig_sprds'], ['vix'], ['us_hy_er_index'], ['us_ig_er_index']]
frequencies = ['D', 'D', 'D','D','D']  # You can edit the frequency for each ticker here

# zip() stops at the shortest input, so a list that is one entry short would
# silently drop a ticker instead of failing. Check the lengths up front.
assert len(tickers) == len(fields) == len(column_names) == len(frequencies), (
    "tickers, fields, column_names and frequencies must have the same length."
)

dataframes = []

for ticker, field, col_name, freq in zip(tickers, fields, column_names, frequencies):
    df = bd.get_single_ticker_data(ticker, field, start_date, end_date, freq=freq, column_names=col_name)
    dataframes.append(df)

# Getting risk-free index (currently disabled)
#rate_df = bd.get_single_ticker_data('GCAN3M Index', ['PX_LAST'], start_date, end_date)
#risk_free_idx = tr.risk_free_index(rate_df,col_name="risk_free")  # Ensure the default col_name is applied

# Merge all dataframes (the risk-free index merge below is disabled as well)
merged_data = bd.merge_dataframes(dataframes)
#merged_data = bd.merge_dataframes([merged_data, risk_free_idx])

# NOTE(review): every frequency above is 'D', yet the merged output of a
# previous run shows month-end rows at the start of the sample and daily rows
# at the end. Presumably at least one series is only available monthly early
# on and the merge keeps common dates only - confirm before relying on
# row-count based windows downstream.

# Print the final merged data and its information
print(merged_data)
print('----------------------------------------------------------------')
print('----------------------------------------------------------------')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appends a stray "None" line.
merged_data.info()

# Work on an independent copy: a plain `data = merged_data` is just a second
# name for the same object, so in-place edits to `data` would also change
# `merged_data`.
data = merged_data.copy()

2024-07-03 12:43:43,205 - INFO - Successfully retrieved data for ticker: .MIDERCAD U Index
2024-07-03 12:43:43,870 - INFO - Successfully retrieved data for ticker: .CADIG F Index
2024-07-03 12:43:44,229 - INFO - Successfully retrieved data for ticker: VIX Index
2024-07-03 12:43:44,834 - INFO - Successfully retrieved data for ticker: .HYUSER U Index
2024-07-03 12:43:45,471 - INFO - Successfully retrieved data for ticker: .IGUSER U Index
2024-07-03 12:43:45,480 - INFO - Merged 5 dataframes using inner method.


            cad_ig_er_index  cad_ig_sprds    vix  us_hy_er_index  \
2002-11-29           1.0143       69.8153  27.50          0.4183   
2002-12-31           1.0146       77.3398  28.62          0.4134   
2003-01-31           1.0155       74.8880  31.17          0.4285   
2003-02-28           1.0159      106.9295  29.63          0.4265   
2003-03-31           1.0142      117.3892  29.15          0.4406   
...                     ...           ...    ...             ...   
2024-06-26           1.3935      120.6997  12.55          1.1236   
2024-06-27           1.3933      120.9534  12.24          1.1215   
2024-06-28           1.3925      120.2679  12.44          1.1270   
2024-07-01           1.3925      121.6130  12.22          1.1319   
2024-07-02           1.3958      122.1411  12.03          1.1290   

            us_ig_er_index  
2002-11-29          1.0150  
2002-12-31          1.0195  
2003-01-31          1.0269  
2003-02-28          1.0303  
2003-03-31          1.0351  
...      

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only adds a stray "None" line after the report.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2035 entries, 2002-11-29 to 2024-07-02
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cad_ig_er_index  2035 non-null   float64
 1   cad_ig_sprds     2035 non-null   float64
 2   vix              2035 non-null   float64
 3   us_hy_er_index   2035 non-null   float64
 4   us_ig_er_index   2035 non-null   float64
dtypes: float64(5)
memory usage: 95.4 KB
None


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import vectorbt as vbt

# Build the feature matrix and the forward-return target from `merged_data`,
# split it by date, and use random-forest importances to pick features.
#
# NOTE(review): the merged frame printed earlier steps monthly at the start of
# the sample and daily at the end, so the "10d/20d/50d" windows below are
# really 10/20/50 *rows* and do not cover a constant amount of calendar time.
# Confirm this is intended.

# Settings a reader may want to tune.
FEATURE_COLS = ['us_hy_er_index', 'cad_ig_sprds', 'vix', 'us_ig_er_index']
WINDOWS = (10, 20, 50)               # look-back lengths, in rows
PRICE_COL = 'cad_ig_er_index'        # series the target return is computed from
TARGET_COL = 'cad_ig_er_index_return'
FORWARD_PERIODS = 20                 # horizon of the target return, in rows
TRAIN_END = '2018-12-31'
TEST_START = '2019-01-01'
RANDOM_STATE = 42                    # fixed so the selected features are reproducible


def build_feature_frame(raw):
    """Return a copy of ``raw`` with engineered features and the target added.

    For every column in ``FEATURE_COLS`` and every window in ``WINDOWS`` this
    adds the relative distance to the rolling mean (``*_diff``), the
    multi-period percentage change (``*_pct_change``) and the rolling standard
    deviation of one-period percentage changes (``*_vol``). It also adds
    ``cad_ig_sprds_last`` / ``vix_last`` and the forward
    ``FORWARD_PERIODS``-row return of ``PRICE_COL`` as ``TARGET_COL``.

    Rows containing any NaN (rolling warm-up at the start, missing forward
    return at the end) are dropped. ``raw`` itself is not modified.
    """
    out = raw.copy()

    # % diff of the last value vs. its rolling mean
    for col in FEATURE_COLS:
        for window in WINDOWS:
            rolling_mean = out[col].rolling(window=window).mean()
            out[f'{col}_{window}d_diff'] = (out[col] - rolling_mean) / rolling_mean

    # Last values of cad_ig_sprds and vix.
    # NOTE(review): these are exact copies of the raw columns, which also stay
    # in the feature matrix, so each pair carries identical information.
    out['cad_ig_sprds_last'] = out['cad_ig_sprds']
    out['vix_last'] = out['vix']

    # % change over the last N rows
    for col in FEATURE_COLS:
        for window in WINDOWS:
            out[f'{col}_{window}d_pct_change'] = out[col].pct_change(periods=window)

    # Volatility of the one-row % change over the last N rows
    for col in FEATURE_COLS:
        one_period_change = out[col].pct_change()
        for window in WINDOWS:
            out[f'{col}_{window}d_vol'] = one_period_change.rolling(window=window).std()

    # Forward return: the return realised over the NEXT FORWARD_PERIODS rows,
    # aligned to the row at which the prediction would be made.
    out[TARGET_COL] = out[PRICE_COL].pct_change(periods=FORWARD_PERIODS).shift(-FORWARD_PERIODS)

    return out.dropna()


# Always rebuild from the raw merged frame so that re-running this cell gives
# the same result (adding columns and dropping rows in place on `data` would
# shrink it further on every run).
data = build_feature_frame(merged_data)

# Ensure there are no missing values left
assert data.isna().sum().sum() == 0, "There are still missing values in the data."

# Split data into training and testing sets
# NOTE(review): the target of the last FORWARD_PERIODS training rows is
# computed from prices dated after TRAIN_END, i.e. it overlaps the test
# period. Consider dropping those rows from `train` if strict out-of-sample
# separation matters.
train = data.loc[:TRAIN_END]
test = data.loc[TEST_START:]
assert len(train) > 0 and len(test) > 0, "Train/test split produced an empty set."

non_feature_cols = [PRICE_COL, TARGET_COL]
X_train = train.drop(columns=non_feature_cols)
y_train = train[TARGET_COL]
X_test = test.drop(columns=non_feature_cols)
y_test = test[TARGET_COL]

# Check the shapes to ensure we have data
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Random Forest for feature importance (seeded for reproducible selection)
rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Select features whose importance is above the mean importance.
# prefit=True reuses the forest fitted above; without it SelectFromModel.fit
# would clone and refit the estimator, making the explicit fit redundant.
sfm = SelectFromModel(rf, threshold='mean', prefit=True)

selected_features = X_train.columns[sfm.get_support()]

print("Selected features:", selected_features)




X_train shape: (638, 42)
y_train shape: (638,)
X_test shape: (1327, 42)
y_test shape: (1327,)
Selected features: Index(['cad_ig_sprds', 'us_hy_er_index', 'cad_ig_sprds_last',
       'us_hy_er_index_20d_pct_change', 'us_hy_er_index_50d_pct_change',
       'cad_ig_sprds_50d_pct_change', 'us_ig_er_index_20d_pct_change',
       'us_ig_er_index_50d_pct_change'],
      dtype='object')


In [5]:
# Show the first and last timestamps covered by `data`, one per line.
print(data.index.min(), data.index.max(), sep="\n")





2007-02-28 00:00:00
2024-06-04 00:00:00
