In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
from IPython.display import display, Markdown
from datetime import datetime, timedelta
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

# Cap printed frames at 10 rows so long frames are shown truncated.
pd.set_option('display.max_rows', 10)  # This will show 5 rows at the start and 5 at the end
import warnings
# NOTE: this silences *every* warning for the rest of the session, including
# deprecation warnings raised by pandas / statsmodels / scikit-learn.
warnings.filterwarnings('ignore')

# Add project root to path so the `src.*` imports below can be resolved.
# The root is taken to be the parent of the current working directory, so this
# only works when the kernel runs from a direct sub-folder of the project
# (presumably a `notebooks/` folder -- confirm).
project_root = Path().absolute().parent
sys.path.append(str(project_root))

# Import utils with different aliases
from src.utils import csv_exporter as csv_utils
from src.utils import validation as val_utils
from src.utils import transformations as trans_utils
from src.utils import data_merger as merge_utils
from src.utils import config_validator as config_utils
from src.utils import metrics as metric_utils
from src.core.bloomberg_fetcher import fetch_bloomberg_data
from src.utils.transformations import get_ohlc

In [10]:
# Getting all the data
# Each key is a Bloomberg (ticker, field) pair; each value is the column name
# used for that series in the rest of the notebook.
mapping = {
    ('I05510CA Index', 'INDEX_OAS_TSY_BP'): 'cad_oas',
    ('LF98TRUU Index', 'INDEX_OAS_TSY_BP'): 'us_hy_oas',
    ('LUACTRUU Index', 'INDEX_OAS_TSY_BP'): 'us_ig_oas',
    ('SPTSX Index', 'PX_LAST'): 'tsx',
    ('VIX Index', 'PX_LAST'): 'vix',
    # NOTE(review): the ticker reads like a 3m/30y curve but the column is
    # named 'us_3m_10y' -- confirm which one is intended. The name is left
    # unchanged because later cells reference 'us_3m_10y_chg_*'.
    ('USYC3M30 Index', 'PX_LAST'): 'us_3m_10y',
    ('BCMPUSGR Index', 'PX_LAST'): 'us_growth_surprises',
    ('BCMPUSIF Index', 'PX_LAST'): 'us_inflation_surprises',
    ('LEI YOY  Index', 'PX_LAST'): 'us_lei_yoy',
    ('.HARDATA G Index', 'PX_LAST'): 'us_hard_data_surprises',
    ('CGERGLOB Index', 'PX_LAST'): 'us_equity_revisions',
    ('.ECONREGI G Index', 'PX_LAST'): 'us_economic_regime',
}

# Sample window: fixed start, open-ended end (today, so reruns pull new data)
end_date = datetime.now().strftime('%Y-%m-%d')
start_date = '2002-01-01'

# Fetch the data
df = fetch_bloomberg_data(
    mapping=mapping,
    start_date=start_date,
    end_date=end_date,
    periodicity='M',
    align_start=True
)

# df.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
df.info()
print('-------')
print('-------')
print(df.head())
print('-------')
print('-------')
print(df.tail())
print('-------')
print('-------')
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 342 entries, 2002-10-31 to 2024-11-30
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cad_oas                 342 non-null    float64
 1   us_hy_oas               342 non-null    float64
 2   us_ig_oas               342 non-null    float64
 3   tsx                     342 non-null    float64
 4   vix                     342 non-null    float64
 5   us_3m_10y               342 non-null    float64
 6   us_growth_surprises     342 non-null    float64
 7   us_inflation_surprises  342 non-null    float64
 8   us_lei_yoy              342 non-null    float64
 9   us_hard_data_surprises  342 non-null    float64
 10  us_equity_revisions     342 non-null    float64
 11  us_economic_regime      342 non-null    float64
dtypes: float64(12)
memory usage: 34.7 KB
None
-------
-------
              cad_oas    us_hy_oas   us_ig_oas      tsx    vix  us_3m_1

In [12]:
# Viz to make sure all the data looks ok



def create_spread_plots(df, max_cols=3, title='Spread Series Over Time', y_title='Spread'):
    """
    Draw every column of ``df`` as its own line chart in a grid of subplots.

    The figure is displayed with ``fig.show`` (dark theme, responsive, with
    drawing tools in the mode bar); nothing is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index is used as the x-axis and whose columns are each
        drawn in a separate panel.
    max_cols : int, optional
        Maximum number of panels per row, by default 3.
    title : str, optional
        Figure title, by default 'Spread Series Over Time'.
    y_title : str, optional
        Y-axis label applied to every panel, by default 'Spread'. Override it
        when the frame mixes spreads with other kinds of series.

    Returns
    -------
    None
    """
    n_series = len(df.columns)
    if n_series == 0:
        # A grid with zero rows/columns cannot be built; bail out gracefully.
        print("[WARNING] No columns to plot. Skipping.")
        return None

    # Grid shape: at most `max_cols` panels per row, as many rows as needed
    n_cols = min(max(1, int(max_cols)), n_series)
    n_rows = math.ceil(n_series / n_cols)

    # Shrink the vertical gap as rows are added so the panels keep some height
    vertical_spacing = min(0.08, 1.0 / (n_rows + 1))

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[str(name) for name in df.columns],
        vertical_spacing=vertical_spacing,
        horizontal_spacing=0.05
    )

    grid_color = 'rgba(128, 128, 128, 0.2)'

    for idx, column in enumerate(df.columns):
        # Fill the grid row by row (1-based positions)
        row = (idx // n_cols) + 1
        col = (idx % n_cols) + 1

        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[column],
                name=column,
                line=dict(width=1),
                showlegend=False,
                hovertemplate="<b>%{x}</b><br>Value: %{y:.2f}<br><extra></extra>"
            ),
            row=row,
            col=col
        )

        # Axes are styled per panel so unused grid cells stay blank
        fig.update_xaxes(
            title_text="Date",
            row=row,
            col=col,
            showgrid=True,
            gridcolor=grid_color,
            tickangle=45,
            tickformat='%Y-%m-%d'
        )
        fig.update_yaxes(
            title_text=y_title,
            row=row,
            col=col,
            showgrid=True,
            gridcolor=grid_color
        )

    # Dark theme, height proportional to the number of rows, responsive width
    fig.update_layout(
        template='plotly_dark',
        showlegend=False,
        autosize=True,
        height=250 * n_rows,
        title={
            'text': title,
            'y': 0.98,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        paper_bgcolor='rgb(30, 30, 30)',
        plot_bgcolor='rgb(30, 30, 30)',
        margin=dict(t=80, l=50, r=50, b=50),
        font=dict(
            family="Arial",
            size=10,
            color="white"
        )
    )

    # Show the plot
    fig.show(config={
        'responsive': True,
        'displayModeBar': True,
        'scrollZoom': True,
        'modeBarButtonsToAdd': ['drawline', 'drawopenpath', 'eraseshape']  # Add drawing tools
    })

# Create the plots: one panel per column of the fetched frame, as a visual
# sanity check of the raw series before any transformation is applied.
create_spread_plots(df)

In [14]:
# Initial data transformation- removing non stationary

import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

def transform_non_stationary_columns_ffill(
    df,
    target_col='cad_oas',
    pval_threshold=0.05,
    periods=(1, 3, 6)
):
    """
    Replace non-stationary columns (excluding the target) by their percentage
    changes over several horizons, forward-fill missing values, and drop any
    rows that still contain NaNs.

    A column is treated as non-stationary when the Augmented Dickey-Fuller
    p-value exceeds ``pval_threshold``. Each such column ``c`` is replaced by
    ``c_chg_{p}m`` for every ``p`` in ``periods``.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with a DatetimeIndex or similar time index. It is not
        modified.
    target_col : str
        Name of the target column to exclude from transformations (e.g., 'cad_oas').
    pval_threshold : float
        The p-value threshold for the Augmented Dickey-Fuller test.
        Any column with p-value > this is considered non-stationary.
    periods : iterable of int, optional
        Look-back horizons (in rows) for the percentage changes, by default
        (1, 3, 6). The generated column suffix is always ``'m'``.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with transformed columns and forward-filled values.

    Notes
    -----
    * Columns with fewer than 12 non-null observations are kept unchanged
      without running the ADF test.
    * Percentage changes are only well-behaved for strictly positive series.
      A zero denominator yields +/-inf; those values are converted to NaN so
      that the forward-fill / row-drop steps handle them like any other gap.
      Series that cross zero can still produce sign flips and very large
      values -- inspect those columns before modelling.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df_transformed = df.copy()

    print("[STEP] Starting transformation on non-stationary columns.")
    print(f"[INFO] Target column to skip: '{target_col}'")
    print(f"[INFO] ADF test p-value threshold: {pval_threshold}")

    # Original columns that get replaced by their % changes
    cols_to_drop = []

    # Iterate over a snapshot of the input columns: new *_chg_* columns are
    # appended to df_transformed inside the loop and must not be revisited.
    for col in list(df.columns):
        if col == target_col:
            print(f"\n[INFO] Skipping stationarity check for target column '{col}'.")
            continue

        print(f"\n[INFO] Checking stationarity for column: {col}")

        # The ADF test cannot handle NaNs
        col_series = df_transformed[col].dropna()

        if len(col_series) < 12:
            print(f"    -> Not enough data ({len(col_series)}) for ADF test in '{col}'. Skipping ADF.")
            # Too little data to test: keep the column unaltered
            continue

        # Second element of the ADF result tuple is the p-value
        adf_p_value = adfuller(col_series, autolag='AIC')[1]

        print(f"    -> ADF test p-value for '{col}': {adf_p_value:.6f}")

        if adf_p_value > pval_threshold:
            print(f"    -> '{col}' is non-stationary (p-value > {pval_threshold}). Transforming...")

            for n_periods in periods:
                new_col = f"{col}_chg_{n_periods}m"
                # x / 0 gives +/-inf, which ffill/dropna would silently keep
                df_transformed[new_col] = (
                    df_transformed[col]
                    .pct_change(n_periods)
                    .replace([np.inf, -np.inf], np.nan)
                )
                print(f"    -> Created column '{new_col}' for {n_periods}-period % change.")

            # Mark the original column for dropping
            cols_to_drop.append(col)
        else:
            print(f"    -> '{col}' is stationary or borderline. Keeping it as is.")

    # Drop the original non-stationary columns
    if cols_to_drop:
        print(f"\n[STEP] Dropping original non-stationary columns: {cols_to_drop}")
        df_transformed = df_transformed.drop(columns=cols_to_drop)
    else:
        print("\n[INFO] No columns identified for dropping.")

    # Forward-fill missing values (fillna(method='ffill') is deprecated)
    print("\n[STEP] Forward-filling missing values created by % change calculations.")
    df_transformed = df_transformed.ffill()

    # Leading NaNs have nothing to be filled from, so those rows are dropped
    print("[STEP] Dropping rows that still have NaN values after forward-fill.")
    rows_before = df_transformed.shape[0]
    df_transformed = df_transformed.dropna()
    rows_after = df_transformed.shape[0]

    print(f"    -> Dropped {rows_before - rows_after} rows.")

    print("\n[STEP] Transformation complete!")
    print("[INFO] Final DataFrame shape:", df_transformed.shape)
    print("[INFO] Final columns:", df_transformed.columns.tolist())

    return df_transformed


# Apply the stationarity screen to the fetched frame `df`. Every column except
# the target is ADF-tested; columns that fail are replaced by their 1-, 3- and
# 6-period percentage changes.

df_transformed = transform_non_stationary_columns_ffill(
    df=df,
    target_col='cad_oas',     # the target is never tested or transformed
    pval_threshold=0.05       # ADF p-values above this count as non-stationary
)

df_transformed.info()


[STEP] Starting transformation on non-stationary columns.
[INFO] Target column to skip: 'cad_oas'
[INFO] ADF test p-value threshold: 0.05

[INFO] Skipping stationarity check for target column 'cad_oas'.

[INFO] Checking stationarity for column: us_hy_oas
    -> ADF test p-value for 'us_hy_oas': 0.022353
    -> 'us_hy_oas' is stationary or borderline. Keeping it as is.

[INFO] Checking stationarity for column: us_ig_oas
    -> ADF test p-value for 'us_ig_oas': 0.062877
    -> 'us_ig_oas' is non-stationary (p-value > 0.05). Transforming...
    -> Created column 'us_ig_oas_chg_1m' for 1-period % change.
    -> Created column 'us_ig_oas_chg_3m' for 3-period % change.
    -> Created column 'us_ig_oas_chg_6m' for 6-period % change.

[INFO] Checking stationarity for column: tsx
    -> ADF test p-value for 'tsx': 0.944986
    -> 'tsx' is non-stationary (p-value > 0.05). Transforming...
    -> Created column 'tsx_chg_1m' for 1-period % change.
    -> Created column 'tsx_chg_3m' for 3-period % c

In [17]:
# EDA

import pandas as pd
import io

def textual_eda_report(df, target_col='cad_oas'):
    """
    Print a purely text-based EDA report for ``df`` (no plots).

    The report has five sections: shape/column info, missing values,
    descriptive statistics (plus skewness and kurtosis), correlations, and a
    fixed list of suggested next steps.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to analyze.
    target_col : str, optional
        The name of the target column to focus correlation analysis on,
        by default 'cad_oas'.

    Returns
    -------
    None
        Prints out the results directly.

    Notes
    -----
    Correlations are computed on numeric columns only, so a frame that also
    holds text/categorical columns does not raise in ``DataFrame.corr``.
    """
    rule = "============================================================"
    dashes = "------------------------------------------------------------"

    def section(title):
        # Section banner: title framed by two rules
        print(rule)
        print(title)
        print(rule)

    section("[STEP 1] BASIC SHAPE AND COLUMN INFORMATION")

    # 1. Print the shape of the DataFrame
    print(f"[INFO] The DataFrame has {df.shape[0]} rows and {df.shape[1]} columns.")

    # 2. Print the list of columns
    print("\n[INFO] Here are the column names in the DataFrame:")
    for i, col in enumerate(df.columns, 1):
        print(f"    {i}. {col}")

    # 3. df.info() writes to a stream; capture it so it can be printed inline
    print("\n[INFO] Detailed info (similar to df.info()):")
    print(dashes)
    with io.StringIO() as info_buf:
        df.info(buf=info_buf)
        print(info_buf.getvalue())

    section("[STEP 2] MISSING VALUE ANALYSIS")

    missing_counts = df.isna().sum()
    total_rows = df.shape[0]

    print("[INFO] Missing value counts and percentages for each column:")
    for col in df.columns:
        miss_count = missing_counts[col]
        # Guard the percentage against a zero-row frame
        miss_percent = (miss_count / total_rows) * 100 if total_rows else 0.0
        print(f"    -> {col}: {miss_count} missing values ({miss_percent:.2f}%).")

    section("[STEP 3] DESCRIPTIVE STATISTICS")

    print("[INFO] Descriptive statistics for numeric columns:")
    print(dashes)
    print(df.describe())

    print("\n[INFO] Skewness and Kurtosis for numeric columns:")
    print(dashes)
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        skew_val = df[col].skew()
        kurt_val = df[col].kurt()
        print(f"    -> '{col}' | Skew: {skew_val:.4f}, Kurt: {kurt_val:.4f}")

    section("[STEP 4] CORRELATION ANALYSIS")

    # Restrict to numeric columns: corr() on mixed dtypes raises in pandas >= 2.0
    print("[INFO] Correlation matrix for numeric columns:")
    print(dashes)
    corr_matrix = df[numeric_cols].corr()
    print(corr_matrix)

    # If target column exists, print correlation specifically with the target
    if target_col not in df.columns:
        print(f"\n[INFO] Target column '{target_col}' not found in the DataFrame.")
    else:
        print(f"\n[INFO] Correlation of features with the target '{target_col}':")
        if target_col not in corr_matrix.columns:
            print("    -> Target column is not numeric; cannot compute correlation with target.")
        elif len(numeric_cols) > 1:
            target_corr = corr_matrix[target_col].sort_values(ascending=False)
            print(dashes)
            print(target_corr)
        else:
            print("    -> Not enough numeric columns to compute correlation with target.")

    section("[STEP 5] RECOMMENDATIONS / NEXT STEPS")
    print("[INFO] Potential next steps based on this EDA:")
    print("1. Investigate any columns with strong positive/negative correlation to the target.")
    print("2. Inspect columns with high skew or kurtosis for potential outliers.")
    print("3. Confirm domain-based expectations for your credit and macro variables.")
    print("4. Consider normalizing or further transforming features as needed.")
    print("5. After cleaning & transformation, move to model experimentation.")
    print("\n[INFO] End of textual EDA report.")
    print(rule)


# Run the text-only EDA on the stationarity-adjusted frame (336 rows x 22
# columns in the captured output), using 'cad_oas' as the target for the
# per-feature correlation listing.
textual_eda_report(df_transformed, target_col='cad_oas')

[STEP 1] BASIC SHAPE AND COLUMN INFORMATION
[INFO] The DataFrame has 336 rows and 22 columns.

[INFO] Here are the column names in the DataFrame:
    1. cad_oas
    2. us_hy_oas
    3. vix
    4. us_growth_surprises
    5. us_hard_data_surprises
    6. us_equity_revisions
    7. us_economic_regime
    8. us_ig_oas_chg_1m
    9. us_ig_oas_chg_3m
    10. us_ig_oas_chg_6m
    11. tsx_chg_1m
    12. tsx_chg_3m
    13. tsx_chg_6m
    14. us_3m_10y_chg_1m
    15. us_3m_10y_chg_3m
    16. us_3m_10y_chg_6m
    17. us_inflation_surprises_chg_1m
    18. us_inflation_surprises_chg_3m
    19. us_inflation_surprises_chg_6m
    20. us_lei_yoy_chg_1m
    21. us_lei_yoy_chg_3m
    22. us_lei_yoy_chg_6m

[INFO] Detailed info (similar to df.info()):
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 336 entries, 2003-03-31 to 2024-11-30
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------  

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer, StandardScaler, RobustScaler
from scipy.stats import boxcox

def winsorize_series(s, lower_quant=0.01, upper_quant=0.99):
    """
    Clip a Series to the values found at two of its own quantiles.

    Parameters
    ----------
    s : pd.Series
        Data to clip.
    lower_quant : float
        Quantile whose value becomes the floor (0.01 = 1st percentile).
    upper_quant : float
        Quantile whose value becomes the ceiling (0.99 = 99th percentile).

    Returns
    -------
    pd.Series
        A new Series in which values below the floor or above the ceiling are
        replaced by the floor / ceiling respectively.
    """
    # Both bounds are measured on the unclipped data
    floor, ceiling = (s.quantile(q) for q in (lower_quant, upper_quant))
    return s.clip(floor, ceiling)


def transform_column_boxcox(s):
    """
    Box-Cox transform a Series, letting SciPy choose the lambda.

    Parameters
    ----------
    s : pd.Series
        Input data; Box-Cox requires strictly positive values.

    Returns
    -------
    pd.Series
        The transformed values, carrying the same index as ``s``.
    float
        The lambda that ``scipy.stats.boxcox`` fitted.
    """
    # boxcox works on the raw array and returns (transformed values, lambda)
    values, fitted_lambda = boxcox(s.values)
    return pd.Series(values, index=s.index), fitted_lambda


def full_data_transformation_pipeline(
    df,
    winsorize_cols=None,
    winsor_lower_quant=0.01,
    winsor_upper_quant=0.99,
    log_cols=None,
    boxcox_cols=None,
    yeojohnson_cols=None,
    scale_method=None
):
    """
    A comprehensive data transformation pipeline with:
      1. Winsorizing / Outlier Capping
      2. Log / Box-Cox / Yeo-Johnson Transform
      3. Optional Feature Scaling (Standard or Robust)

    Steps run in the order above on a copy of ``df``; a column listed in more
    than one step is transformed by each of them in turn. Columns that are
    missing, or that violate a transform's sign requirement, are skipped with
    a ``[WARNING]`` message instead of raising.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. It is not modified.
    winsorize_cols : list of str or None
        Columns to winsorize at [winsor_lower_quant, winsor_upper_quant].
    winsor_lower_quant : float
        Quantile boundary for lower clipping.
    winsor_upper_quant : float
        Quantile boundary for upper clipping.
    log_cols : list of str or None
        Columns to apply log(1+x) transform (must be >= 0).
    boxcox_cols : list of str or None
        Columns for Box-Cox transform (must be strictly > 0).
    yeojohnson_cols : list of str or None
        Columns for Yeo-Johnson transform (handles negatives/zero).
    scale_method : str or None
        If "standard", applies StandardScaler to all numeric columns.
        If "robust", applies RobustScaler to all numeric columns.
        If None, no scaling is performed.
        Any other value prints a warning and no scaling is performed.

    Returns
    -------
    pd.DataFrame
        A transformed copy of the original DataFrame.

    Notes
    -----
    Every statistic used here (winsorizing quantiles, Box-Cox / Yeo-Johnson
    lambdas, scaler centre and spread) is estimated on the full frame that is
    passed in, and scaling covers *all* numeric columns. If the frame contains
    a target column or observations that will later serve as a test set, they
    take part in those estimates.
    """
    df_transformed = df.copy()

    # ------------------------------
    # 1. WINSORIZE / OUTLIER CAPPING
    # ------------------------------
    if winsorize_cols:
        print(f"\n[STEP] Winsorizing columns: {winsorize_cols} "
              f"with lower={winsor_lower_quant}, upper={winsor_upper_quant}")
        for col in winsorize_cols:
            if col not in df_transformed.columns:
                print(f"[WARNING] Column '{col}' not found for winsorizing. Skipping.")
                continue
            df_transformed[col] = winsorize_series(
                df_transformed[col],
                lower_quant=winsor_lower_quant,
                upper_quant=winsor_upper_quant
            )
            print(f"    -> Winsorized '{col}'")

    # ------------------------------
    # 2. APPLY SELECTED TRANSFORMS
    # ------------------------------

    # 2.1 LOG TRANSFORM
    if log_cols:
        print(f"\n[STEP] Applying log(1+x) transform to columns: {log_cols}")
        for col in log_cols:
            if col not in df_transformed.columns:
                print(f"[WARNING] Column '{col}' not found for log transform. Skipping.")
                continue
            if (df_transformed[col] < 0).any():
                print(f"[WARNING] Column '{col}' has negative values. Skipping log transform.")
                continue
            df_transformed[col] = np.log1p(df_transformed[col])
            print(f"    -> Applied log(1+x) transform to '{col}'")

    # 2.2 BOX-COX
    if boxcox_cols:
        print(f"\n[STEP] Applying Box-Cox transform to columns: {boxcox_cols}")
        for col in boxcox_cols:
            if col not in df_transformed.columns:
                print(f"[WARNING] Column '{col}' not found for Box-Cox. Skipping.")
                continue
            if (df_transformed[col] <= 0).any():
                print(f"[WARNING] Column '{col}' has zero or negative values. Skipping Box-Cox.")
                continue
            transformed_series, lam = transform_column_boxcox(df_transformed[col])
            df_transformed[col] = transformed_series
            print(f"    -> Applied Box-Cox to '{col}' with lambda={lam:.4f}")

    # 2.3 YEO-JOHNSON
    if yeojohnson_cols:
        print(f"\n[STEP] Applying Yeo-Johnson transform to columns: {yeojohnson_cols}")
        valid_cols = [c for c in yeojohnson_cols if c in df_transformed.columns]
        if not valid_cols:
            print("[WARNING] None of the specified Yeo-Johnson columns found. Skipping.")
        else:
            # standardize=False: only the power transform is applied here;
            # centring/scaling is left to the optional step 3.
            yj = PowerTransformer(method='yeo-johnson', standardize=False)
            yj.fit(df_transformed[valid_cols])
            df_transformed[valid_cols] = yj.transform(df_transformed[valid_cols])
            print(f"    -> Applied Yeo-Johnson transform to {valid_cols}")
            print("    -> Fitted lambda values (per column):")
            for c, lam_val in zip(valid_cols, yj.lambdas_):
                print(f"       {c}: lambda={lam_val:.4f}")

    # ------------------------------
    # 3. OPTIONAL SCALING
    # ------------------------------
    if scale_method in ["standard", "robust"]:
        numeric_cols = df_transformed.select_dtypes(include='number').columns
        if len(numeric_cols) == 0:
            # The scalers raise on a frame with zero features
            print("\n[WARNING] No numeric columns found. Skipping scaling.")
        else:
            if scale_method == "standard":
                print(f"\n[STEP] Applying StandardScaler to numeric columns: {list(numeric_cols)}")
                scaler = StandardScaler()
            else:
                print(f"\n[STEP] Applying RobustScaler to numeric columns: {list(numeric_cols)}")
                scaler = RobustScaler()

            df_transformed[numeric_cols] = scaler.fit_transform(df_transformed[numeric_cols])
            print("    -> Scaling complete.")
    elif scale_method is not None:
        # A typo such as "Standard" used to be ignored without any message
        print(f"\n[WARNING] Unknown scale_method '{scale_method}'. "
              "Expected 'standard', 'robust' or None. Skipping scaling.")

    print("\n[INFO] All requested transformations have been applied.")
    return df_transformed


# Apply the transformation pipeline to the stationarity-adjusted frame.
# NOTE(review): scale_method="standard" standardizes every numeric column,
# including the Box-Cox'ed target 'cad_oas', and all quantiles / lambdas /
# scaler statistics are estimated on the full sample -- confirm this is
# acceptable given the time-based train/test split used for modelling.
df_post_transform = full_data_transformation_pipeline(
    df=df_transformed,
    winsorize_cols=["us_3m_10y_chg_6m", "us_inflation_surprises_chg_6m"],
    winsor_lower_quant=0.01,
    winsor_upper_quant=0.99,
    log_cols=["vix"],
    boxcox_cols=["cad_oas"],
    yeojohnson_cols=["us_growth_surprises", "tsx_chg_1m"],
    scale_method="standard"
)



[STEP] Winsorizing columns: ['us_3m_10y_chg_6m', 'us_inflation_surprises_chg_6m'] with lower=0.01, upper=0.99
    -> Winsorized 'us_3m_10y_chg_6m'
    -> Winsorized 'us_inflation_surprises_chg_6m'

[STEP] Applying log(1+x) transform to columns: ['vix']
    -> Applied log(1+x) transform to 'vix'

[STEP] Applying Box-Cox transform to columns: ['cad_oas']
    -> Applied Box-Cox to 'cad_oas' with lambda=-0.0739

[STEP] Applying Yeo-Johnson transform to columns: ['us_growth_surprises', 'tsx_chg_1m']
    -> Applied Yeo-Johnson transform to ['us_growth_surprises', 'tsx_chg_1m']
    -> Fitted lambda values (per column):
       us_growth_surprises: lambda=-0.0335
       tsx_chg_1m: lambda=5.5080

[STEP] Applying StandardScaler to numeric columns: ['cad_oas', 'us_hy_oas', 'vix', 'us_growth_surprises', 'us_hard_data_surprises', 'us_equity_revisions', 'us_economic_regime', 'us_ig_oas_chg_1m', 'us_ig_oas_chg_3m', 'us_ig_oas_chg_6m', 'tsx_chg_1m', 'tsx_chg_3m', 'tsx_chg_6m', 'us_3m_10y_chg_1m', 'us

In [22]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df, features):
    """
    Compute the variance inflation factor of each column in ``features``.

    Rows with a missing value in any of the selected columns are discarded
    first. The remaining values are handed to statsmodels'
    ``variance_inflation_factor`` exactly as they are (no intercept column is
    added here).

    Returns a DataFrame with columns "Feature" and "VIF", sorted from the
    highest VIF to the lowest.
    """
    # Complete cases only, as a plain array for statsmodels
    design = df[features].dropna().values

    # One (name, VIF) pair per feature; `pos` is the column position in `design`
    pairs = [
        (name, variance_inflation_factor(design, pos))
        for pos, name in enumerate(features)
    ]

    return pd.DataFrame(pairs, columns=["Feature", "VIF"]).sort_values("VIF", ascending=False)

def detect_and_fix_multicollinearity(df, threshold=5.0, target_col=None):
    """
    1. Print correlation matrix for numeric features.
    2. Calculate VIF for all numeric features.
    3. Iteratively drop the feature with the highest VIF if above 'threshold'.
    4. Return a new DataFrame with reduced multicollinearity.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing your features (already transformed if necessary).
        It is not modified.
    threshold : float
        The VIF threshold above which a feature is considered too collinear.
    target_col : str or None, optional
        Column to protect from the screen. When given, it is left out of the
        VIF computation and can never be dropped. With the default (None)
        every numeric column is screened, so a target column stored in ``df``
        is treated like any other feature and may be removed.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` without the columns removed for high VIF. Non-numeric
        columns and ``target_col`` are always kept.
    """
    rule = "============================================================"

    # Select only numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Columns that are screened and may be dropped
    candidate_cols = [c for c in numeric_cols if c != target_col]

    print(rule)
    print("[STEP 1] CORRELATION MATRIX")
    print(rule)
    corr_matrix = df[numeric_cols].corr()
    print(corr_matrix)

    # Copy the DataFrame so we don't modify the original
    df_reduced = df.copy()

    # VIF compares a feature against the others, so it needs at least two
    if len(candidate_cols) < 2:
        print("\n[INFO] Fewer than two numeric features to compare. Skipping VIF screening.")
        return df_reduced

    print("\n" + rule)
    print("[STEP 2] INITIAL VIF CALCULATION")
    print(rule)
    vif_df = calculate_vif(df_reduced, candidate_cols)
    print(vif_df.to_string(index=False))

    # Iteratively remove the feature with the highest VIF until all are below threshold
    high_vif_features = []
    iteration = 1

    while True:
        # calculate_vif returns rows sorted by VIF, highest first
        max_vif_feature = vif_df.iloc[0]["Feature"]
        max_vif_value = vif_df.iloc[0]["VIF"]

        if max_vif_value <= threshold:
            print(f"\n[INFO] All features now have VIF <= {threshold}.")
            break

        # Drop the feature with the highest VIF from the DataFrame
        print(f"\n[STEP] Iteration {iteration}:")
        print(f"    -> Highest VIF is {max_vif_value:.2f} for feature '{max_vif_feature}'. Removing it...")
        df_reduced = df_reduced.drop(columns=[max_vif_feature])
        candidate_cols.remove(max_vif_feature)
        high_vif_features.append((max_vif_feature, max_vif_value))

        # Stop before VIF becomes undefined (nothing left to compare against)
        if len(candidate_cols) < 2:
            print("    -> Fewer than two features remain. Stopping.")
            break

        # Recompute VIF
        vif_df = calculate_vif(df_reduced, candidate_cols)
        print("    -> New VIF summary:")
        print(vif_df.to_string(index=False))

        iteration += 1

    if high_vif_features:
        print("\n[INFO] The following features were removed due to high VIF:")
        for feat, val in high_vif_features:
            print(f"    -> {feat} (VIF={val:.2f})")
    else:
        print("\n[INFO] No features were removed. No high VIF values found.")

    return df_reduced


In [23]:
# Screen the transformed frame for multicollinearity with a VIF cut-off of 5.
# NOTE(review): every numeric column of df_post_transform is screened, the
# target 'cad_oas' included, so the target itself can be dropped here --
# confirm it is still present in df_fixed before modelling.
df_fixed = detect_and_fix_multicollinearity(df_post_transform, threshold=5.0)

# 'df_fixed' is a copy of df_post_transform without the numeric columns that were removed for VIF > 5.0.


[STEP 1] CORRELATION MATRIX
                                cad_oas  us_hy_oas       vix  \
cad_oas                        1.000000   0.723250  0.619668   
us_hy_oas                      0.723250   1.000000  0.713434   
vix                            0.619668   0.713434  1.000000   
us_growth_surprises           -0.366419  -0.475904 -0.073849   
us_hard_data_surprises        -0.129828  -0.259653 -0.021815   
...                                 ...        ...       ...   
us_inflation_surprises_chg_3m -0.091665  -0.049422 -0.006404   
us_inflation_surprises_chg_6m -0.009879  -0.017824 -0.050300   
us_lei_yoy_chg_1m              0.050814   0.064127  0.094418   
us_lei_yoy_chg_3m              0.083161   0.095666  0.092460   
us_lei_yoy_chg_6m              0.066176   0.096673  0.071565   

                               us_growth_surprises  us_hard_data_surprises  \
cad_oas                                  -0.366419               -0.129828   
us_hy_oas                                -0.475

In [28]:
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

def time_based_train_test_split(df, target_col, split_date):
    """
    Cut a date-indexed frame into chronological train and test parts.

    Rows dated on or before ``split_date`` form the training set; rows dated
    strictly after it form the test set. A short summary of both date ranges
    is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a DatetimeIndex; it is sorted by index before splitting.
    target_col : str
        Column returned as ``y``; all other columns are returned as ``X``.
    split_date : str or pd.Timestamp
        Last date that belongs to the training set.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    ordered = df.sort_index()
    cutoff = pd.to_datetime(split_date)

    # Everything except the target is a predictor
    predictor_names = [name for name in ordered.columns if name != target_col]

    # Two independent comparisons: train is <= cutoff, test is > cutoff
    in_train = ordered.index <= cutoff
    in_test = ordered.index > cutoff

    X_train, y_train = ordered.loc[in_train, predictor_names], ordered.loc[in_train, target_col]
    X_test, y_test = ordered.loc[in_test, predictor_names], ordered.loc[in_test, target_col]

    banner = "======================================="
    print(banner)
    print("[TIME-BASED SPLIT]")
    print(banner)
    print(f"Training data from {X_train.index.min()} to {X_train.index.max()} "
          f"({len(X_train)} rows)")
    print(f"Testing data from  {X_test.index.min()} to {X_test.index.max()} "
          f"({len(X_test)} rows)")

    return X_train, X_test, y_train, y_test


def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    """
    Fit seven scikit-learn regressors and score each on the held-out set.

    Every estimator is created with its default hyper-parameters (the tree
    ensembles additionally get ``random_state=42``), fitted on
    ``(X_train, y_train)`` and scored on ``(X_test, y_test)``. MSE, RMSE and
    R² are printed per model as training proceeds.

    Returns
    -------
    results_df : pd.DataFrame
        One row per model with columns ``Model``, ``MSE``, ``RMSE``, ``R2``,
        in the order the models were trained.
    models_dict : dict
        Fitted estimator instances keyed by model name.
    """
    separator = "======================================="

    # Insertion order of this dict is the training/reporting order.
    candidates = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "ElasticNet": ElasticNet(),
        "RandomForest": RandomForestRegressor(random_state=42),
        "GradientBoosting": GradientBoostingRegressor(random_state=42),
        "ExtraTrees": ExtraTreesRegressor(random_state=42),
    }

    score_rows = []
    fitted = {}

    for label, estimator in candidates.items():
        print(separator)
        print(f"[TRAINING] {label}")
        print(separator)

        # fit() returns the estimator itself, so prediction can be chained.
        predictions = estimator.fit(X_train, y_train).predict(X_test)

        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, predictions)

        for line in (f"  MSE  = {mse:.4f}", f"  RMSE = {rmse:.4f}", f"  R²   = {r2:.4f}"):
            print(line)

        score_rows.append({"Model": label, "MSE": mse, "RMSE": rmse, "R2": r2})
        fitted[label] = estimator

    return pd.DataFrame(score_rows), fitted


def run_full_modeling_pipeline(
    df_fixed,
    target_col="cad_oas",
    split_date="2018-12-31"
):
    """
    Run the split -> train -> report workflow in one call.

    Steps:
      1. Chronological train/test split at ``split_date``.
      2. Fit and score the model suite on that split.
      3. Display the score table ordered by ascending RMSE.

    Parameters
    ----------
    df_fixed : pd.DataFrame
        Modeling frame containing ``target_col`` plus the feature columns.
    target_col : str
        Name of the column to predict, e.g. 'cad_oas'.
    split_date : str
        Last date included in the training set, e.g. '2018-12-31'.

    Returns
    -------
    results_df : pd.DataFrame
        Performance metrics per model, in training order (the RMSE-sorted
        view is only displayed, not returned).
    models_dict : dict
        Fitted model objects keyed by model name.
    """
    rule = "======================================="

    # Step 1: chronological split.
    train_X, test_X, train_y, test_y = time_based_train_test_split(
        df_fixed, target_col, split_date
    )

    # Step 2: fit and score every model.
    results_df, models_dict = train_and_evaluate_models(train_X, train_y, test_X, test_y)

    # Step 3: report, best (lowest RMSE) first.
    print("\n" + rule)
    print("[SUMMARY] Model Performance Comparison")
    print(rule)
    ranked = results_df.sort_values("RMSE")
    display(ranked)

    return results_df, models_dict



In [29]:

# Fit and score every model on df_fixed, training on data up to and
# including 2018-12-31 and testing on everything after it.
results_df, models_dict = run_full_modeling_pipeline(
    df_fixed,
    target_col="cad_oas",
    split_date="2018-12-31",
)


[TIME-BASED SPLIT]
Training data from 2003-03-31 00:00:00 to 2018-12-31 00:00:00 (244 rows)
Testing data from  2019-01-31 00:00:00 to 2024-11-30 00:00:00 (92 rows)
[TRAINING] LinearRegression
  MSE  = 1.1340
  RMSE = 1.0649
  R²   = -3.8783
[TRAINING] Ridge
  MSE  = 1.0998
  RMSE = 1.0487
  R²   = -3.7312
[TRAINING] Lasso
  MSE  = 0.4286
  RMSE = 0.6547
  R²   = -0.8439
[TRAINING] ElasticNet
  MSE  = 0.4922
  RMSE = 0.7016
  R²   = -1.1174
[TRAINING] RandomForest
  MSE  = 1.3327
  RMSE = 1.1544
  R²   = -4.7333
[TRAINING] GradientBoosting
  MSE  = 1.1427
  RMSE = 1.0690
  R²   = -3.9160
[TRAINING] ExtraTrees
  MSE  = 0.8002
  RMSE = 0.8945
  R²   = -2.4423

[SUMMARY] Model Performance Comparison


Unnamed: 0,Model,MSE,RMSE,R2
2,Lasso,0.428605,0.65468,-0.84387
3,ElasticNet,0.492184,0.701558,-1.117387
6,ExtraTrees,0.800168,0.894521,-2.442337
1,Ridge,1.099757,1.048693,-3.731178
0,LinearRegression,1.133953,1.064872,-3.878288
5,GradientBoosting,1.142711,1.068977,-3.915965
4,RandomForest,1.332698,1.154425,-4.733293


In [30]:
def create_advanced_features(
    df,
    target_col="cad_oas",
    lags=[1, 3, 6],
    rolling_windows=[3, 6]
):
    """
    Generate lag and rolling-window features for every non-target column.

    For each column other than ``target_col`` the function adds:
      - ``<col>_lagX``   : the column shifted by X periods, for X in ``lags``;
      - ``<col>_rmeanX`` : rolling mean over a window of X periods;
      - ``<col>_rstdX``  : rolling standard deviation over a window of X periods.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame holding the target and feature columns. It is not
        modified; the result is a new frame sorted by index.
    target_col : str
        Column excluded from all transformations.
    lags : list of int
        Shift sizes, in periods. Use positive values: the values are passed
        straight to ``Series.shift`` without validation, and a negative shift
        would pull future observations into the current row.
    rolling_windows : list of int
        Window sizes, in periods, for the rolling mean/std features. The
        pandas default trailing window is used, which includes the current
        row.

    Returns
    -------
    pd.DataFrame
        The original columns followed by all lag features, then all rolling
        features. Early rows contain NaN wherever a lag or window reaches
        before the start of the data; drop them before modeling.

    Notes
    -----
    - New columns are gathered and attached with a single ``pd.concat``
      instead of being inserted one by one. Repeated single-column insertion
      is what triggers pandas' "DataFrame is highly fragmented"
      PerformanceWarning once many features are generated.
    - If a generated name already exists in the frame, that column is
      overwritten in place (keeping its position), so column labels stay
      unique.
    - Further transformations (rolling min/max, ratios, ...) can be added by
      registering them through the same ``_add_feature`` helper.
    """
    # sort_index() returns a new frame, so the caller's DataFrame is untouched.
    df_features = df.sort_index()

    # Fixed up front so freshly generated columns are never transformed again.
    feature_cols = [col for col in df_features.columns if col != target_col]

    # Engineered columns waiting to be attached in one go (insertion-ordered).
    new_columns = {}

    def _add_feature(name, values):
        """Register one engineered column and log its creation."""
        if name in df_features.columns:
            # Name clash with an existing column: overwrite in place rather
            # than appending a second column with the same label.
            df_features[name] = values
        else:
            new_columns[name] = values
        print(f"  -> Created {name}")

    # 1. Lag features
    print("============================================================")
    print("[STEP] Creating Lag Features")
    print("============================================================")

    for col in feature_cols:
        for lag in lags:
            _add_feature(f"{col}_lag{lag}", df_features[col].shift(lag))

    # 2. Rolling window statistics
    print("\n============================================================")
    print("[STEP] Creating Rolling Window Features (Mean, Std)")
    print("============================================================")

    for col in feature_cols:
        for window in rolling_windows:
            rolling_view = df_features[col].rolling(window=window)
            _add_feature(f"{col}_rmean{window}", rolling_view.mean())
            _add_feature(f"{col}_rstd{window}", rolling_view.std())

    # Attach everything at once; every new Series shares df_features' index,
    # so no realignment takes place.
    if new_columns:
        df_features = pd.concat(
            [df_features] + [values.rename(name) for name, values in new_columns.items()],
            axis=1,
        )

    print("\n[INFO] Feature engineering complete. Note that early rows may contain NaN due to lags/rolling.")

    return df_features

# Build lag and rolling-window features from df_fixed; 'cad_oas' is the
# target and is therefore left untransformed.
df_advanced = create_advanced_features(
    df_fixed,
    target_col="cad_oas",
    lags=[1, 3, 6],           # 1-, 3- and 6-period lags
    rolling_windows=[3, 6],   # 3- and 6-period rolling windows
)
print("[INFO] df_advanced shape:", df_advanced.shape)

# The first rows have no lag/rolling history and hold NaN; remove them
# before modeling.
df_advanced = df_advanced.dropna()
print("[INFO] After dropna, shape:", df_advanced.shape)



[STEP] Creating Lag Features
  -> Created us_hy_oas_lag1
  -> Created us_hy_oas_lag3
  -> Created us_hy_oas_lag6
  -> Created vix_lag1
  -> Created vix_lag3
  -> Created vix_lag6
  -> Created us_growth_surprises_lag1
  -> Created us_growth_surprises_lag3
  -> Created us_growth_surprises_lag6
  -> Created us_hard_data_surprises_lag1
  -> Created us_hard_data_surprises_lag3
  -> Created us_hard_data_surprises_lag6
  -> Created us_equity_revisions_lag1
  -> Created us_equity_revisions_lag3
  -> Created us_equity_revisions_lag6
  -> Created us_economic_regime_lag1
  -> Created us_economic_regime_lag3
  -> Created us_economic_regime_lag6
  -> Created us_ig_oas_chg_1m_lag1
  -> Created us_ig_oas_chg_1m_lag3
  -> Created us_ig_oas_chg_1m_lag6
  -> Created us_ig_oas_chg_6m_lag1
  -> Created us_ig_oas_chg_6m_lag3
  -> Created us_ig_oas_chg_6m_lag6
  -> Created tsx_chg_1m_lag1
  -> Created tsx_chg_1m_lag3
  -> Created tsx_chg_1m_lag6
  -> Created tsx_chg_3m_lag1
  -> Created tsx_chg_3m_lag3
  ->

In [34]:
# Same pipeline and split date as before, now on the lag/rolling-augmented
# frame. Note that this overwrites the previous results_df and models_dict.
results_df, models_dict = run_full_modeling_pipeline(
    df_advanced,
    target_col="cad_oas",
    split_date="2018-12-31",
)

[TIME-BASED SPLIT]
Training data from 2003-08-29 00:00:00 to 2018-12-31 00:00:00 (238 rows)
Testing data from  2019-01-31 00:00:00 to 2024-11-30 00:00:00 (92 rows)
[TRAINING] LinearRegression
  MSE  = 5.8412
  RMSE = 2.4169
  R²   = -24.1290
[TRAINING] Ridge
  MSE  = 1.2664
  RMSE = 1.1254
  R²   = -4.4483
[TRAINING] Lasso
  MSE  = 0.4249
  RMSE = 0.6518
  R²   = -0.8279
[TRAINING] ElasticNet
  MSE  = 0.5318
  RMSE = 0.7292
  R²   = -1.2878
[TRAINING] RandomForest
  MSE  = 0.9457
  RMSE = 0.9725
  R²   = -3.0685
[TRAINING] GradientBoosting
  MSE  = 0.7975
  RMSE = 0.8930
  R²   = -2.4307
[TRAINING] ExtraTrees
  MSE  = 0.5917
  RMSE = 0.7692
  R²   = -1.5454

[SUMMARY] Model Performance Comparison


Unnamed: 0,Model,MSE,RMSE,R2
2,Lasso,0.424896,0.65184,-0.827909
3,ElasticNet,0.531787,0.729237,-1.287757
6,ExtraTrees,0.591668,0.769199,-1.545366
5,GradientBoosting,0.797454,0.893003,-2.430662
4,RandomForest,0.945713,0.972478,-3.068478
1,Ridge,1.266444,1.125364,-4.448268
0,LinearRegression,5.841218,2.416861,-24.129038
