In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
from IPython.display import display, Markdown
from datetime import datetime, timedelta
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

# Set display options to show first and last 5 rows
pd.set_option('display.max_rows', 10)  # This will show 5 rows at the start and 5 at the end
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path().absolute().parent
sys.path.append(str(project_root))

# Import utils with different aliases
from src.utils import csv_exporter as csv_utils
from src.utils import validation as val_utils
from src.utils import transformations as trans_utils
from src.utils import data_merger as merge_utils
from src.utils import config_validator as config_utils
from src.utils import metrics as metric_utils
from src.core.bloomberg_fetcher import fetch_bloomberg_data
from src.utils.transformations import get_ohlc

In [2]:
# Getting all the data 
mapping = {
    ('I05510CA Index', 'INDEX_OAS_TSY_BP'): 'cad_oas',
    ('LF98TRUU Index', 'INDEX_OAS_TSY_BP'): 'us_hy_oas',
    ('LUACTRUU Index', 'INDEX_OAS_TSY_BP'): 'us_ig_oas',
    ('SPTSX Index', 'PX_LAST'): 'tsx',
    ('VIX Index', 'PX_LAST'): 'vix',
    ('USYC3M30 Index', 'PX_LAST'): 'us_3m_10y',
    ('BCMPUSGR Index', 'PX_LAST'): 'us_growth_surprises',
    ('BCMPUSIF Index', 'PX_LAST'): 'us_inflation_surprises',
    ('LEI YOY  Index', 'PX_LAST'): 'us_lei_yoy',
    ('.HARDATA G Index', 'PX_LAST'): 'us_hard_data_surprises',
    ('CGERGLOB Index', 'PX_LAST'): 'us_equity_revisions',
    ('.ECONREGI G Index', 'PX_LAST'): 'us_economic_regime',
 
}

# Calculate dates
end_date = datetime.now().strftime('%Y-%m-%d')
start_date ='2002-01-01'

# Fetch the data
df = fetch_bloomberg_data(
    mapping=mapping,
    start_date=start_date,
    end_date=end_date,
    periodicity='D',
    align_start=True
).dropna()

print(df.info())
print('-------')
print('-------')
print(df.head())
print('-------')
print('-------')
print(df.tail())
print('-------')
print('-------')
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5859 entries, 2002-10-31 to 2024-12-30
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cad_oas                 5859 non-null   float64
 1   us_hy_oas               5859 non-null   float64
 2   us_ig_oas               5859 non-null   float64
 3   tsx                     5859 non-null   float64
 4   vix                     5859 non-null   float64
 5   us_3m_10y               5859 non-null   float64
 6   us_growth_surprises     5859 non-null   float64
 7   us_inflation_surprises  5859 non-null   float64
 8   us_lei_yoy              5859 non-null   float64
 9   us_hard_data_surprises  5859 non-null   float64
 10  us_equity_revisions     5859 non-null   float64
 11  us_economic_regime      5859 non-null   float64
dtypes: float64(12)
memory usage: 595.1 KB
None
-------
-------
              cad_oas    us_hy_oas   us_ig_oas      tsx    vix  us_3m

In [3]:
# Viz to make sure all the data looks ok



def create_spread_plots(df):
    # Calculate number of rows and columns needed based on number of series
    n_series = len(df.columns)
    n_rows = math.ceil(n_series / 3)  # Calculate required rows
    n_cols = min(3, n_series)  # Use 3 columns or less if fewer series
    
    # Adjust vertical spacing based on number of rows
    vertical_spacing = min(0.08, 1.0 / (n_rows + 1))  # Dynamic spacing
    
    # Create subplot grid
    fig = make_subplots(
        rows=n_rows, 
        cols=n_cols,
        subplot_titles=df.columns,
        vertical_spacing=vertical_spacing,
        horizontal_spacing=0.05
    )
    
    # Add each series to a subplot
    for idx, column in enumerate(df.columns):
        row = (idx // n_cols) + 1
        col = (idx % n_cols) + 1
        
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[column],
                name=column,
                line=dict(width=1),
                showlegend=False,
                hovertemplate=
                "<b>%{x}</b><br>" +
                "Value: %{y:.2f}<br>" +
                "<extra></extra>"
            ),
            row=row,
            col=col
        )
        
        # Update axes labels
        fig.update_xaxes(
            title_text="Date",
            row=row,
            col=col,
            showgrid=True,
            gridcolor='rgba(128, 128, 128, 0.2)',
            tickangle=45,
            tickformat='%Y-%m-%d'
        )
        fig.update_yaxes(
            title_text="Spread",
            row=row,
            col=col,
            showgrid=True,
            gridcolor='rgba(128, 128, 128, 0.2)'
        )

    # Update layout for dark theme and responsiveness
    fig.update_layout(
        template='plotly_dark',
        showlegend=False,
        height=250 * n_rows,  # Adjusted height per row
        title={
            'text': 'Spread Series Over Time',
            'y':0.98,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        paper_bgcolor='rgb(30, 30, 30)',
        plot_bgcolor='rgb(30, 30, 30)',
        margin=dict(t=80, l=50, r=50, b=50),
        font=dict(
            family="Arial",
            size=10,
            color="white"
        )
    )

    # Make it responsive
    fig.update_layout(
        autosize=True,
    )
    
    # Show the plot
    fig.show(config={
        'responsive': True,
        'displayModeBar': True,
        'scrollZoom': True,
        'modeBarButtonsToAdd': ['drawline', 'drawopenpath', 'eraseshape']  # Add drawing tools
    })

# Create the plots
create_spread_plots(df)

In [4]:
# Ensure we have a datetime index (assuming df already has a datetime index from fetch_bloomberg_data)
if not isinstance(df.index, pd.DatetimeIndex):
    df.index = pd.to_datetime(df.index)

# Create a copy to avoid SettingWithCopyWarning
df_cleaned = df.copy()

# Fill missing values forward (if any)
df_cleaned = df_cleaned.fillna(method='ffill')

# Calculate returns and volatility for tsx only
df_cleaned['tsx_30d_pct_change'] = df_cleaned['tsx'].pct_change(periods=30) * 100
df_cleaned['tsx_90d_pct_change'] = df_cleaned['tsx'].pct_change(periods=90) * 100
df_cleaned['tsx_30d_rolling_std'] = df_cleaned['tsx'].rolling(window=30).std()
df_cleaned['tsx_90d_rolling_std'] = df_cleaned['tsx'].rolling(window=90).std()

# Drop the original tsx column
df_cleaned = df_cleaned.drop(columns=['tsx'])

# Drop any remaining NA values (from the rolling calculations)
df_cleaned = df_cleaned.dropna()

# Display information about the resulting DataFrame
df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5790 entries, 2003-02-03 to 2024-12-27
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cad_oas                 5790 non-null   float64
 1   us_hy_oas               5790 non-null   float64
 2   us_ig_oas               5790 non-null   float64
 3   vix                     5790 non-null   float64
 4   us_3m_10y               5790 non-null   float64
 5   us_growth_surprises     5790 non-null   float64
 6   us_inflation_surprises  5790 non-null   float64
 7   us_lei_yoy              5790 non-null   float64
 8   us_hard_data_surprises  5790 non-null   float64
 9   us_equity_revisions     5790 non-null   float64
 10  us_economic_regime      5790 non-null   float64
 11  tsx_30d_pct_change      5790 non-null   float64
 12  tsx_90d_pct_change      5790 non-null   float64
 13  tsx_30d_rolling_std     5790 non-null   float64
 14  tsx_90d_rolling_std   

In [5]:


# Target definition
target_def = df_cleaned.copy()
target_def['cad_oas_30d_forward'] = df_cleaned['cad_oas'].shift(-30)
target_def = target_def.drop(columns=['cad_oas'])

# 1. Lagged Features
lag_periods = [1, 5, 10]
for col in [col for col in target_def.columns if col != 'cad_oas_30d_forward']:
    for lag in lag_periods:
        target_def[f'{col}_lag_{lag}'] = target_def[col].shift(lag)

# 2. Percentage Differences
diff_periods = [1, 5]
for col in [col for col in target_def.columns if col != 'cad_oas_30d_forward']:
    for diff in diff_periods:
        target_def[f'{col}_pct_diff_{diff}'] = target_def[col].pct_change(periods=diff) * 100

# 3. Rolling Statistics
rolling_window = [5, 20]
cols_for_rolling = ['vix', 'us_hy_oas', 'us_ig_oas']
for col in cols_for_rolling:
    for window in rolling_window:
        target_def[f'{col}_rolling_mean_{window}'] = target_def[col].rolling(window=window).mean()
        target_def[f'{col}_rolling_std_{window}'] = target_def[col].rolling(window=window).std()

# 4. Interaction Terms
target_def['us_hy_ig_ratio'] = target_def['us_hy_oas'] / target_def['us_ig_oas']
target_def['vix_times_spread'] = target_def['vix'] * target_def['us_3m_10y']

# --- Initial "Logical" Features ---
target_def['cad_us_hy_spread_diff'] = target_def['us_hy_oas'] - df_cleaned['cad_oas']
target_def['cad_us_ig_spread_diff'] = target_def['us_ig_oas'] - df_cleaned['cad_oas']
target_def['us_hy_oas_change_1d'] = target_def['us_hy_oas'].diff()
target_def['us_ig_oas_change_1d'] = target_def['us_ig_oas'].diff()
target_def['vix_div_us_hy_oas'] = target_def['vix'] / target_def['us_hy_oas']
target_def['vix_div_us_ig_oas'] = target_def['vix'] / target_def['us_ig_oas']
target_def['tsx_minus_us_equity_revisions'] = target_def['tsx_30d_pct_change'] - target_def['us_equity_revisions']
target_def['us_3m_10y_change_1d'] = target_def['us_3m_10y'].diff()
target_def['us_economic_regime_lag_1'] = target_def['us_economic_regime'].shift(1)

# --- Additional 20 Logical Features ---
target_def['us_hy_oas_change_accel'] = target_def['us_hy_oas_change_1d'].diff()
target_def['us_ig_oas_change_accel'] = target_def['us_ig_oas_change_1d'].diff()
target_def['us_hy_ig_oas_ratio'] = target_def['us_hy_oas'] / target_def['us_ig_oas']
target_def['vix_change_1d'] = target_def['vix'].diff()
target_def['growth_minus_inflation_surprises'] = target_def['us_growth_surprises'] - target_def['us_inflation_surprises']
target_def['us_equity_revisions_times_vix'] = target_def['us_equity_revisions'] * target_def['vix']
target_def['us_3m_10y_times_growth_surprises'] = target_def['us_3m_10y'] * target_def['us_growth_surprises']
target_def['vix_rolling_mean_20_change_1d'] = target_def['vix_rolling_mean_20'].diff()
target_def['tsx_volatility_ratio'] = target_def['tsx_30d_rolling_std'] / target_def['tsx_90d_rolling_std']
target_def['us_lei_yoy_change_1d'] = target_def['us_lei_yoy'].diff()
target_def['us_hard_data_surprises_change_1d'] = target_def['us_hard_data_surprises'].diff()
target_def['tsx_minus_us_hy_oas_change'] = target_def['tsx_30d_pct_change'] - target_def['us_hy_oas_change_1d']
target_def['vix_lag_1_times_us_hy_oas_lag_1'] = target_def['vix'].shift(1) * target_def['us_hy_oas'].shift(1)
target_def['us_growth_surprises_accel'] = target_def['us_growth_surprises'].diff().diff()
target_def['growth_surprises_over_vix'] = target_def['us_growth_surprises'] / target_def['vix']
target_def['tsx_90d_minus_us_ig_oas_change'] = target_def['tsx_90d_pct_change'] - target_def['us_ig_oas_change_1d']
target_def['economic_regime_times_vix'] = target_def['us_economic_regime'] * target_def['vix']
target_def['us_3m_10y_lag_1_times_growth_surprises_lag_1'] = target_def['us_3m_10y'].shift(1) * target_def['us_growth_surprises'].shift(1)
target_def['tsx_volatility_ratio_change_1d'] = target_def['tsx_volatility_ratio'].diff()
target_def['inflation_surprises_over_vix'] = target_def['us_inflation_surprises'] / target_def['vix']
target_def['us_hy_oas_times_us_ig_oas'] = target_def['us_hy_oas'] * target_def['us_ig_oas']

# Remove rows with NaN values
target_def = target_def.dropna()

target_def.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5240 entries, 2003-03-03 to 2024-11-18
Columns: 212 entries, us_hy_oas to us_hy_oas_times_us_ig_oas
dtypes: float64(212)
memory usage: 8.5 MB


In [6]:
target_def

Unnamed: 0_level_0,us_hy_oas,us_ig_oas,vix,us_3m_10y,us_growth_surprises,us_inflation_surprises,us_lei_yoy,us_hard_data_surprises,us_equity_revisions,us_economic_regime,...,tsx_minus_us_hy_oas_change,vix_lag_1_times_us_hy_oas_lag_1,us_growth_surprises_accel,growth_surprises_over_vix,tsx_90d_minus_us_ig_oas_change,economic_regime_times_vix,us_3m_10y_lag_1_times_growth_surprises_lag_1,tsx_volatility_ratio_change_1d,inflation_surprises_over_vix,us_hy_oas_times_us_ig_oas
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-03-03,795.085376,163.221281,30.43,348.719,0.15366,-0.05386,2.2,0.1066,-0.14,0.69,...,0.977352,23705.078443,-0.05239,0.005050,5.516517,20.9967,57.972045,-0.032597,-0.001770,129774.853575
2003-03-04,795.094006,164.803276,31.83,348.555,0.14218,-0.05359,2.2,0.0888,-0.14,0.69,...,-4.021298,24194.447992,0.00158,0.004467,-0.129155,21.9627,53.584162,-0.002614,-0.001684,131034.096917
2003-03-05,798.273098,167.281176,30.38,349.362,0.14147,-0.05332,2.2,0.0888,-0.14,0.69,...,-8.109849,25307.842211,0.01077,0.004657,-0.220183,20.9622,49.557550,-0.009270,-0.001755,133536.062603
2003-03-06,794.152731,168.250404,31.37,353.995,0.14874,-0.06209,2.2,-0.0220,-0.14,0.69,...,-1.947674,24251.536717,0.00798,0.004741,-0.855329,21.6453,49.424242,-0.013117,-0.001979,133616.517828
2003-03-07,798.159309,169.850543,31.08,357.172,0.14354,-0.06178,2.2,0.0134,-0.22,0.69,...,-8.583391,24912.571171,-0.01247,0.004618,-2.731710,21.4452,52.653216,-0.004930,-0.001988,135567.792034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-12,252.667800,73.670700,14.71,2.573,-0.19171,0.50917,-4.2,0.1419,-0.08,0.41,...,7.309211,3836.481660,0.01543,-0.013033,11.420301,6.0311,1.511043,0.002332,0.034614,18614.213693
2024-11-13,255.220300,75.585100,14.02,13.222,-0.19695,0.51127,-4.2,0.1419,-0.08,0.41,...,1.561693,3716.743338,-0.02172,-0.014048,8.930507,5.7482,-0.493270,-0.000903,0.036467,19290.851898
2024-11-14,254.170500,76.627700,14.31,5.270,-0.19550,0.55319,-4.2,0.1438,-0.08,0.41,...,5.560595,3578.188606,0.00669,-0.013662,9.437246,5.8671,-2.604073,-0.004996,0.038658,19476.500823
2024-11-15,264.826800,77.446800,16.14,10.540,-0.14768,0.59638,-4.2,0.1107,-0.07,0.41,...,-7.644029,3637.179855,0.04637,-0.009150,8.582404,6.6174,-1.030285,-0.004779,0.036950,20509.988214


In [8]:
# Split data chronologically
train_size = int(0.7 * len(target_def))
val_size = int(0.15 * len(target_def))
train_data = target_def.iloc[:train_size]
val_data = target_def.iloc[train_size:train_size + val_size]
test_data = target_def.iloc[train_size + val_size:]

X_train = train_data.drop('cad_oas_30d_forward', axis=1)
y_train = train_data['cad_oas_30d_forward']
X_val = val_data.drop('cad_oas_30d_forward', axis=1)
y_val = val_data['cad_oas_30d_forward']
X_test = test_data.drop('cad_oas_30d_forward', axis=1)
y_test = test_data['cad_oas_30d_forward']

print("Original feature shape:", X_train.shape)



Original feature shape: (3667, 211)


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso  # Import LinearRegression here
from sklearn.decomposition import PCA
import numpy as np

# Assuming df_cleaned is your pre-existing DataFrame

# Target definition
target_def = df_cleaned.copy()
target_def['cad_oas_30d_forward'] = df_cleaned['cad_oas'].shift(-30)
target_def = target_def.drop(columns=['cad_oas'])

# 1. Lagged Features
lag_periods = [1, 5, 10]
for col in [col for col in target_def.columns if col != 'cad_oas_30d_forward']:
    for lag in lag_periods:
        target_def[f'{col}_lag_{lag}'] = target_def[col].shift(lag)

# 2. Percentage Differences
diff_periods = [1, 5]
for col in [col for col in target_def.columns if col != 'cad_oas_30d_forward']:
    for diff in diff_periods:
        target_def[f'{col}_pct_diff_{diff}'] = target_def[col].pct_change(periods=diff) * 100

# 3. Rolling Statistics
rolling_window = [5, 20]
cols_for_rolling = ['vix', 'us_hy_oas', 'us_ig_oas']
for col in cols_for_rolling:
    for window in rolling_window:
        target_def[f'{col}_rolling_mean_{window}'] = target_def[col].rolling(window=window).mean()
        target_def[f'{col}_rolling_std_{window}'] = target_def[col].rolling(window=window).std()

# 4. Interaction Terms
target_def['us_hy_ig_ratio'] = target_def['us_hy_oas'] / target_def['us_ig_oas']
target_def['vix_times_spread'] = target_def['vix'] * target_def['us_3m_10y']

# --- Initial "Logical" Features ---
target_def['cad_us_hy_spread_diff'] = target_def['us_hy_oas'] - df_cleaned['cad_oas']
target_def['cad_us_ig_spread_diff'] = target_def['us_ig_oas'] - df_cleaned['cad_oas']
target_def['us_hy_oas_change_1d'] = target_def['us_hy_oas'].diff()
target_def['us_ig_oas_change_1d'] = target_def['us_ig_oas'].diff()
target_def['vix_div_us_hy_oas'] = target_def['vix'] / target_def['us_hy_oas']
target_def['vix_div_us_ig_oas'] = target_def['vix'] / target_def['us_ig_oas']
target_def['tsx_minus_us_equity_revisions'] = target_def['tsx_30d_pct_change'] - target_def['us_equity_revisions']
target_def['us_3m_10y_change_1d'] = target_def['us_3m_10y'].diff()
target_def['us_economic_regime_lag_1'] = target_def['us_economic_regime'].shift(1)

# --- Additional 20 Logical Features ---
target_def['us_hy_oas_change_accel'] = target_def['us_hy_oas_change_1d'].diff()
target_def['us_ig_oas_change_accel'] = target_def['us_ig_oas_change_1d'].diff()
target_def['us_hy_ig_oas_ratio'] = target_def['us_hy_oas'] / target_def['us_ig_oas']
target_def['vix_change_1d'] = target_def['vix'].diff()
target_def['growth_minus_inflation_surprises'] = target_def['us_growth_surprises'] - target_def['us_inflation_surprises']
target_def['us_equity_revisions_times_vix'] = target_def['us_equity_revisions'] * target_def['vix']
target_def['us_3m_10y_times_growth_surprises'] = target_def['us_3m_10y'] * target_def['us_growth_surprises']
target_def['vix_rolling_mean_20_change_1d'] = target_def['vix_rolling_mean_20'].diff()
target_def['tsx_volatility_ratio'] = target_def['tsx_30d_rolling_std'] / target_def['tsx_90d_rolling_std']
target_def['us_lei_yoy_change_1d'] = target_def['us_lei_yoy'].diff()
target_def['us_hard_data_surprises_change_1d'] = target_def['us_hard_data_surprises'].diff()
target_def['tsx_minus_us_hy_oas_change'] = target_def['tsx_30d_pct_change'] - target_def['us_hy_oas_change_1d']
target_def['vix_lag_1_times_us_hy_oas_lag_1'] = target_def['vix'].shift(1) * target_def['us_hy_oas'].shift(1)
target_def['us_growth_surprises_accel'] = target_def['us_growth_surprises'].diff().diff()
target_def['growth_surprises_over_vix'] = target_def['us_growth_surprises'] / target_def['vix']
target_def['tsx_90d_minus_us_ig_oas_change'] = target_def['tsx_90d_pct_change'] - target_def['us_ig_oas_change_1d']
target_def['economic_regime_times_vix'] = target_def['us_economic_regime'] * target_def['vix']
target_def['us_3m_10y_lag_1_times_growth_surprises_lag_1'] = target_def['us_3m_10y'].shift(1) * target_def['us_growth_surprises'].shift(1)
target_def['tsx_volatility_ratio_change_1d'] = target_def['tsx_volatility_ratio'].diff()
target_def['inflation_surprises_over_vix'] = target_def['us_inflation_surprises'] / target_def['vix']
target_def['us_hy_oas_times_us_ig_oas'] = target_def['us_hy_oas'] * target_def['us_ig_oas']

# Replace infinite values with NaN
target_def = target_def.replace([np.inf, -np.inf], np.nan)

# Remove rows with NaN values
target_def = target_def.dropna()

# Split data chronologically
train_size = int(0.7 * len(target_def))
val_size = int(0.15 * len(target_def))
train_data = target_def.iloc[:train_size]
val_data = target_def.iloc[train_size:train_size + val_size]
test_data = target_def.iloc[train_size + val_size:]

X_train = train_data.drop('cad_oas_30d_forward', axis=1)
y_train = train_data['cad_oas_30d_forward']
X_val = val_data.drop('cad_oas_30d_forward', axis=1)
y_val = val_data['cad_oas_30d_forward']
X_test = test_data.drop('cad_oas_30d_forward', axis=1)
y_test = test_data['cad_oas_30d_forward']

print("Original feature shape:", X_train.shape)

# --- Feature Selection Methods ---

# 1. SelectKBest
selector_kbest = SelectKBest(score_func=f_regression, k=20)
X_train_kbest = selector_kbest.fit_transform(X_train, y_train)
selected_features_kbest_indices = selector_kbest.get_support()
selected_features_kbest = X_train.columns[selected_features_kbest_indices]
X_train_reduced_kbest = X_train[selected_features_kbest]
X_val_reduced_kbest = X_val[selected_features_kbest]
X_test_reduced_kbest = X_test[selected_features_kbest]
print("SelectKBest - Selected features:", selected_features_kbest.tolist())
print("SelectKBest - Reduced feature shape:", X_train_reduced_kbest.shape)

# 2. Feature Importance from Random Forest
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
importances = rf_model.feature_importances_
feature_importances_rf = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
top_n_features_rf = feature_importances_rf.head(20).index
X_train_reduced_rf = X_train[top_n_features_rf]
X_val_reduced_rf = X_val[top_n_features_rf]
X_test_reduced_rf = X_test[top_n_features_rf]
print("Random Forest - Selected features:", top_n_features_rf.tolist())
print("Random Forest - Reduced feature shape:", X_train_reduced_rf.shape)

# 3. Recursive Feature Elimination (RFE)
estimator_rfe = LinearRegression()
selector_rfe = RFE(estimator_rfe, n_features_to_select=20, step=1)
selector_rfe = selector_rfe.fit(X_train, y_train)
selected_features_rfe = X_train.columns[selector_rfe.support_]
X_train_reduced_rfe = X_train[selected_features_rfe]
X_val_reduced_rfe = X_val[selected_features_rfe]
X_test_reduced_rfe = X_test[selected_features_rfe]
print("RFE - Selected features:", selected_features_rfe.tolist())
print("RFE - Reduced feature shape:", X_train_reduced_rfe.shape)

# 4. Feature Selection using Regularization (Lasso)
scaler_lasso = StandardScaler()
X_train_scaled_lasso = scaler_lasso.fit_transform(X_train)
X_val_scaled_lasso = scaler_lasso.transform(X_val)
X_test_scaled_lasso = scaler_lasso.transform(X_test)

lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled_lasso, y_train)
selected_features_lasso = X_train.columns[lasso.coef_ != 0]
X_train_reduced_lasso = X_train[selected_features_lasso]
X_val_reduced_lasso = X_val[selected_features_lasso]
X_test_reduced_lasso = X_test[selected_features_lasso]
print("Lasso - Selected features:", selected_features_lasso.tolist())
print("Lasso - Reduced feature shape:", X_train_reduced_lasso.shape)

# 5. Dimensionality Reduction (PCA)
scaler_pca = StandardScaler()
X_train_scaled_pca = scaler_pca.fit_transform(X_train)
X_val_scaled_pca = scaler_pca.transform(X_val)
X_test_scaled_pca = scaler_pca.transform(X_test)

pca = PCA(n_components=20)
X_train_pca = pca.fit_transform(X_train_scaled_pca)
X_val_pca = pca.transform(X_val_scaled_pca)
X_test_pca = pca.transform(X_test_scaled_pca)
print("PCA - Reduced feature shape:", X_train_pca.shape)

Original feature shape: (3574, 211)
SelectKBest - Selected features: ['us_hy_oas', 'us_ig_oas', 'tsx_90d_rolling_std', 'us_hy_oas_lag_1', 'us_hy_oas_lag_5', 'us_hy_oas_lag_10', 'us_ig_oas_lag_1', 'us_ig_oas_lag_5', 'us_ig_oas_lag_10', 'tsx_90d_rolling_std_lag_1', 'tsx_90d_rolling_std_lag_5', 'vix_rolling_mean_5', 'us_hy_oas_rolling_mean_5', 'us_hy_oas_rolling_mean_20', 'us_ig_oas_rolling_mean_5', 'us_ig_oas_rolling_mean_20', 'cad_us_hy_spread_diff', 'us_equity_revisions_times_vix', 'vix_lag_1_times_us_hy_oas_lag_1', 'us_hy_oas_times_us_ig_oas']
SelectKBest - Reduced feature shape: (3574, 20)
Random Forest - Selected features: ['us_ig_oas', 'us_economic_regime', 'cad_us_ig_spread_diff', 'us_ig_oas_lag_1', 'us_ig_oas_rolling_mean_20', 'us_ig_oas_rolling_mean_5', 'us_economic_regime_lag_1', 'economic_regime_times_vix', 'inflation_surprises_over_vix', 'us_growth_surprises_lag_10', 'us_growth_surprises_lag_5', 'tsx_90d_pct_change_lag_5', 'us_growth_surprises_lag_1_pct_diff_1', 'us_hy_oas_ti