In [6]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from pycaret.time_series import TSForecastingExperiment
import plotly.graph_objects as go
import math
from plotly.subplots import make_subplots
from datetime import datetime
import sys
from pathlib import Path
from pycaret.time_series import *
import logging
import os

# Make the project root importable so the `src.*` packages can be resolved.
# Assumes the kernel's working directory is the notebooks directory, one level
# below the project root.
project_root = Path().absolute().parent
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Local imports
from src.core.bloomberg_fetcher import fetch_bloomberg_data
from src import trans_utils  # Using the one from src root since that's what you have open
from src.utils import merge_utils  # Using the one from utils since that's what you have open
from src.utils.csv_exporter import export_to_csv

In [7]:
# Build the monthly feature table used for the cad_oas forecasting experiment.
#
# Bloomberg (ticker, field) pairs -> output column names.
# NOTE(review): 'USYC3M30 Index' is labelled 'us_3m_10y', and 'LEI YOY  Index'
# contains a double space — confirm both tickers are what was intended.
mapping = {
    ('I05510CA Index', 'INDEX_OAS_TSY_BP'): 'cad_oas',
    ('LF98TRUU Index', 'INDEX_OAS_TSY_BP'): 'us_hy_oas',
    ('LUACTRUU Index', 'INDEX_OAS_TSY_BP'): 'us_ig_oas',
    ('SPTSX Index', 'PX_LAST'): 'tsx',
    ('VIX Index', 'PX_LAST'): 'vix',
    ('USYC3M30 Index', 'PX_LAST'): 'us_3m_10y',
    ('BCMPUSGR Index', 'PX_LAST'): 'us_growth_surprises',
    ('BCMPUSIF Index', 'PX_LAST'): 'us_inflation_surprises',
    ('LEI YOY  Index', 'PX_LAST'): 'us_lei_yoy',
    ('.HARDATA G Index', 'PX_LAST'): 'us_hard_data_surprises',
    ('CGERGLOB Index', 'PX_LAST'): 'us_equity_revisions',
    ('.ECONREGI G Index', 'PX_LAST'): 'us_economic_regime',
}

# Date window: fixed start, "today" as the end
end_date = datetime.now().strftime('%Y-%m-%d')
start_date = '2002-01-01'

# Fetch the level / spread series and keep only rows where every series is present
df = fetch_bloomberg_data(
    mapping=mapping,
    start_date=start_date,
    end_date=end_date,
    periodicity='M',
    align_start=True
).dropna()

# Year-to-date excess return series
mapping1 = {
    ('I05510CA Index', 'INDEX_EXCESS_RETURN_YTD'): 'cad_ig_er',
    ('LF98TRUU Index', 'INDEX_EXCESS_RETURN_YTD'): 'us_hy_er',
    ('LUACTRUU Index', 'INDEX_EXCESS_RETURN_YTD'): 'us_ig_er',
}

# Fetch the er_ytd data
df1 = fetch_bloomberg_data(
    mapping=mapping1,
    start_date=start_date,
    end_date=end_date,
    periodicity='M',
    align_start=True
).dropna()

# Convert er_ytd data to an index and merge it with the feature frame
df2 = trans_utils.convert_er_ytd_to_index(df1[['cad_ig_er', 'us_hy_er', 'us_ig_er']])
final_df = merge_utils.merge_dfs(df, df2, fill='ffill', start_date_align='yes')

# Handle bad data point for cad_oas on Nov 15 2005: overwrite it with the most
# recent earlier observation. Skipped when the date is absent or has no
# predecessor (the latter would otherwise raise IndexError on .iloc[-1]).
bad_date = '2005-11-15'
if bad_date in final_df.index:
    earlier_cad_oas = final_df.loc[final_df.index < bad_date, 'cad_oas']
    if not earlier_cad_oas.empty:
        final_df.loc[bad_date, 'cad_oas'] = earlier_cad_oas.iloc[-1]

# Drop the er_index columns. They are not used as model features, so the
# fillna(100) that used to precede this drop had no effect on the result
# and has been removed.
final_df = final_df.drop(['cad_ig_er_index', 'us_hy_er_index', 'us_ig_er_index'], axis=1)

# Calculate TSX percent changes over 1, 3, 6 and 12 rows.
# NOTE(review): these are row offsets; they equal months only if final_df has
# exactly one row per month — confirm against the merged index.
for months in (1, 3, 6, 12):
    final_df[f'tsx_{months}m'] = final_df['tsx'].pct_change(periods=months) * 100

# Drop the original tsx column
final_df = final_df.drop('tsx', axis=1)

# Drop any NA rows that resulted from the calculations
final_df = final_df.dropna()

# Export the final DataFrame to CSV, overwriting if exists.
# NOTE(review): absolute, machine-specific path; the model-training cell reads
# the same location, so change both together if this is relocated.
export_path = export_to_csv(
    data=final_df,
    name='monthly_oas_pycaret',
    export_dir='c:/Users/Eddy/Documents/auto_ml/csv_outputs',
    overwrite=True  # Explicitly set to overwrite existing file
)

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line).
final_df.info()
print('-----------------------------')
print('-----------------------------')
print(final_df.head())
print('-----------------------------')
print('-----------------------------')
print(final_df.tail())






2025-02-14 19:11:30,553 - INFO - Successfully exported monthly_oas_pycaret to c:/Users/Eddy/Documents/auto_ml/csv_outputs\monthly_oas_pycaret.csv


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 332 entries, 2003-08-29 to 2025-01-31
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cad_oas                 332 non-null    float64
 1   us_hy_oas               332 non-null    float64
 2   us_ig_oas               332 non-null    float64
 3   vix                     332 non-null    float64
 4   us_3m_10y               332 non-null    float64
 5   us_growth_surprises     332 non-null    float64
 6   us_inflation_surprises  332 non-null    float64
 7   us_lei_yoy              332 non-null    float64
 8   us_hard_data_surprises  332 non-null    float64
 9   us_equity_revisions     332 non-null    float64
 10  us_economic_regime      332 non-null    float64
 11  tsx_1m                  332 non-null    float64
 12  tsx_3m                  332 non-null    float64
 13  tsx_6m                  332 non-null    float64
 14  tsx_12m                

In [10]:
import logging
import pandas as pd
import os
import pickle
from pycaret.time_series import *

# Root logger: INFO level, records rendered as "<timestamp> - <LEVEL> - <message>"
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

def save_model_to_pickle(model, filename):
    """
    Serialize a model to disk with pickle.

    Args:
        model: The trained model object to serialize
        filename: Path of the pickle file to write; a bare file name is
            resolved against the current working directory

    Raises:
        Exception: Any failure while writing is logged and then re-raised
    """
    try:
        with open(filename, 'wb') as pickle_file:
            pickle.dump(model, pickle_file)
        # Log the absolute location so the artifact is easy to find afterwards
        logging.info(f"Model successfully saved to {os.path.abspath(filename)}")
    except Exception as exc:
        logging.error(f"Error saving model to pickle: {str(exc)}")
        raise

def load_and_prepare_data(file_path):
    """
    Load the exported CSV and reshape it into a regular month-end time series.

    The CSV must contain a 'Date' column. Rows are sorted chronologically and
    then re-indexed onto calendar month-end dates spanning the first to the
    last observation; each month-end takes the most recent observation at or
    before it (forward fill).

    Args:
        file_path: Path to the CSV file containing the data

    Returns:
        pd.DataFrame: Prepared dataframe with a month-end DatetimeIndex named 'Date'

    Raises:
        Exception: Any loading/parsing error is logged and then re-raised
    """
    try:
        logging.info(f"Loading data from {file_path}")
        raw = pd.read_csv(file_path)

        # Convert Date column to datetime, use it as the index, and sort so
        # the series is in chronological order
        raw['Date'] = pd.to_datetime(raw['Date'])
        prepared = raw.set_index('Date').sort_index()

        # Ensure monthly frequency and forward fill any missing values.
        # The MonthEnd offset object is used instead of the 'M' string alias:
        # 'M' is deprecated in pandas 2.2+ (renamed 'ME'), while the offset
        # object means the same thing on every pandas version.
        prepared = prepared.asfreq(pd.offsets.MonthEnd(), method='ffill')

        logging.info(f"Data loaded successfully. Shape: {prepared.shape}")
        return prepared
    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        raise

def train_predict_model(data):
    """
    Run a PyCaret time-series experiment on `data`, pick the best candidate
    by MASE, finalize it and pickle it to 'best_model.pkl'.

    Args:
        data: DataFrame with the time series data; must contain the
            'cad_oas' target column

    Returns:
        tuple: (final_model, model_metrics) — the finalized best model and
        the grid returned by `pull()` immediately after `compare_models`

    Raises:
        Exception: Any failure is logged and then re-raised
    """
    # Candidate estimators: only the ML models known to work with this setup
    linear_models = ['lr_cds_dt', 'en_cds_dt', 'ridge_cds_dt', 'lasso_cds_dt', 'br_cds_dt']
    other_regressors = ['huber_cds_dt', 'omp_cds_dt', 'knn_cds_dt']
    tree_models = ['dt_cds_dt', 'rf_cds_dt', 'et_cds_dt', 'gbr_cds_dt']
    boosting_models = ['ada_cds_dt', 'lightgbm_cds_dt', 'catboost_cds_dt']
    include = linear_models + other_regressors + tree_models + boosting_models

    # Experiment configuration forwarded verbatim to pycaret's setup()
    experiment_config = dict(
        data=data,
        target='cad_oas',
        fh=1,
        fold=3,
        seasonal_period=12,
        fold_strategy='expanding',
        transform_target=None,
        session_id=123,
        verbose=True,
    )

    try:
        logging.info("Initializing model setup")
        setup(**experiment_config)

        logging.info("Training and comparing models")
        best_model = compare_models(sort='MASE', include=include)

        # pull() must run right after compare_models to capture its score grid
        logging.info("Getting model performance metrics")
        model_metrics = pull()

        logging.info("Finalizing best model")
        final_model = finalize_model(best_model)

        # Persist the finalized model next to the working directory
        save_model_to_pickle(final_model, 'best_model.pkl')
    except Exception as exc:
        logging.error(f"Error in model training/prediction: {str(exc)}")
        raise

    return final_model, model_metrics

def analyze_model_performance(model, data):
    """
    Print the latest PyCaret metrics grid, forecast with the given model,
    save the forecast to CSV and print it next to the last observed value.

    Args:
        model: Trained model from PyCaret
        data: Training data DataFrame; must contain the 'cad_oas' column

    Raises:
        Exception: Any failure is logged and then re-raised
    """
    try:
        # Whatever grid PyCaret produced most recently
        print("\nModel Performance Metrics:")
        print(pull())

        # Last row of every non-target column is passed as X for the forecast
        exog_data = data.drop(columns='cad_oas').tail(1).copy()

        logging.info("Making predictions")
        predictions = predict_model(model, X=exog_data)

        # Write the forecast, creating the output directory if it doesn't exist
        csv_path = 'csv_outputs/cad_oas_predictions.csv'
        output_dir = os.path.dirname(csv_path)
        os.makedirs(output_dir, exist_ok=True)
        predictions.to_csv(csv_path)
        logging.info(f"Predictions saved to {os.path.abspath(csv_path)}")

        # Latest observation vs. forecast (prediction is taken from the last column)
        last_actual = data['cad_oas'].iloc[-1]
        next_pred = predictions[predictions.columns[-1]].iloc[-1]

        print("\nPrediction Results:")
        print(f"Last actual value: {last_actual:.4f}")
        print(f"Predicted next value: {next_pred:.4f}")

    except Exception as exc:
        logging.error(f"Error analyzing model performance: {str(exc)}")
        raise

def main():
    """
    Orchestrate the run: load the exported CSV, train and select a model,
    then report metrics and the next-step prediction.

    The finalized model is published through the module-level name `model`.
    Any failure is logged and then re-raised.
    """
    global model  # expose the trained model to later notebook cells

    # Location of the CSV to load
    file_path = 'c:/Users/Eddy/Documents/auto_ml/csv_outputs/monthly_oas_pycaret.csv'

    try:
        data = load_and_prepare_data(file_path)
        model, metrics = train_predict_model(data)
        analyze_model_performance(model, data)
    except Exception as exc:
        logging.error(f"Error in main execution: {str(exc)}")
        raise

if __name__ == "__main__":
    main()

2025-02-14 19:14:41,292 - INFO - Loading data from c:/Users/Eddy/Documents/auto_ml/csv_outputs/monthly_oas_pycaret.csv
2025-02-14 19:14:41,299 - INFO - Data loaded successfully. Shape: (258, 15)
2025-02-14 19:14:41,299 - INFO - Initializing model setup


Unnamed: 0,Description,Value
0,session_id,123
1,Target,cad_oas
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(258, 15)"
5,Transformed data shape,"(258, 15)"
6,Transformed train set shape,"(257, 15)"
7,Transformed test set shape,"(1, 15)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


2025-02-14 19:14:41,667 - INFO - Training and comparing models


Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,TT (Sec)
ada_cds_dt,AdaBoost w/ Cond. Deseasonalize & Detrending,0.3746,0.2105,3.2114,3.2114,0.0318,0.0311,0.04
et_cds_dt,Extra Trees w/ Cond. Deseasonalize & Detrending,0.4333,0.2434,3.7166,3.7166,0.0357,0.035,0.0533
gbr_cds_dt,Gradient Boosting w/ Cond. Deseasonalize & Detrending,0.5127,0.2879,4.3973,4.3973,0.0423,0.0409,0.06
omp_cds_dt,Orthogonal Matching Pursuit w/ Cond. Deseasonalize & Detrending,0.5342,0.3001,4.5829,4.5829,0.044,0.0443,0.0233
rf_cds_dt,Random Forest w/ Cond. Deseasonalize & Detrending,0.5545,0.3114,4.7558,4.7558,0.0457,0.0441,0.0633
lightgbm_cds_dt,Light Gradient Boosting w/ Cond. Deseasonalize & Detrending,0.5932,0.3332,5.0883,5.0883,0.049,0.048,0.18
br_cds_dt,Bayesian Ridge w/ Cond. Deseasonalize & Detrending,0.6151,0.3457,5.2736,5.2736,0.0524,0.0507,0.0267
catboost_cds_dt,CatBoost Regressor w/ Cond. Deseasonalize & Detrending,0.6363,0.3575,5.4579,5.4579,0.0528,0.0514,0.8167
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,0.6471,0.3636,5.5474,5.5474,0.0552,0.0532,0.0267
lasso_cds_dt,Lasso w/ Cond. Deseasonalize & Detrending,0.6658,0.3742,5.7081,5.7081,0.0567,0.0547,0.0267


2025-02-14 19:14:48,518 - INFO - Getting model performance metrics
2025-02-14 19:14:48,518 - INFO - Finalizing best model
2025-02-14 19:14:48,872 - INFO - Model successfully saved to c:\Users\Eddy\Documents\auto_ml\notebooks\best_model.pkl
2025-02-14 19:14:48,875 - INFO - Making predictions
2025-02-14 19:14:49,035 - INFO - Predictions saved to c:\Users\Eddy\Documents\auto_ml\notebooks\csv_outputs\cad_oas_predictions.csv



Model Performance Metrics:
                                                             Model    MASE  \
ada_cds_dt            AdaBoost w/ Cond. Deseasonalize & Detrending  0.3746   
et_cds_dt          Extra Trees w/ Cond. Deseasonalize & Detrending  0.4333   
gbr_cds_dt       Gradient Boosting w/ Cond. Deseasonalize & Det...  0.5127   
omp_cds_dt       Orthogonal Matching Pursuit w/ Cond. Deseasona...  0.5342   
rf_cds_dt        Random Forest w/ Cond. Deseasonalize & Detrending  0.5545   
lightgbm_cds_dt  Light Gradient Boosting w/ Cond. Deseasonalize...  0.5932   
br_cds_dt        Bayesian Ridge w/ Cond. Deseasonalize & Detren...  0.6151   
catboost_cds_dt  CatBoost Regressor w/ Cond. Deseasonalize & De...  0.6363   
en_cds_dt          Elastic Net w/ Cond. Deseasonalize & Detrending  0.6471   
lasso_cds_dt             Lasso w/ Cond. Deseasonalize & Detrending  0.6658   
huber_cds_dt             Huber w/ Cond. Deseasonalize & Detrending  0.8132   
dt_cds_dt        Decision Tree w/ Co

In [17]:
import pickle
import warnings  # Import the warnings module
from typing import Union, List

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from pycaret.time_series import plot_model, predict_model

def generate_pycaret_plots(model, data=None, plot_types: Union[str, List[str]] = 'all', target: str = 'cad_oas'):
    """
    Generate and display PyCaret time series plots in the notebook with proper error handling.

    Each requested plot is attempted independently: a failure is recorded and
    reported, and the remaining plots are still generated.

    Args:
        model: Trained PyCaret model
        data: Optional DataFrame containing `target`. When given, the
            'forecast' plot is built from `predict_model` using the last row
            of the non-target columns; otherwise `plot_model` is used
        plot_types: 'all', a single plot name, or a list of plot names
        target: Name of the target column in `data` (default 'cad_oas')

    Returns:
        dict: plot name -> {'status': 'success', 'warnings': [...]} or
        {'status': 'failed', 'error': '...'}
    """
    # All available plot types in PyCaret time series
    all_plot_types = [
        'ts', 'train_test_split', 'residuals', 'forecast',
        'diagnostics', 'decomp', 'acf', 'pacf', 'decomp_stl'
    ]

    # A bare string is either the 'all' sentinel or ONE plot name. Wrap a single
    # name in a list, otherwise the loop below would iterate over its characters.
    if isinstance(plot_types, str):
        plot_types = all_plot_types if plot_types == 'all' else [plot_types]
    else:
        plot_types = list(plot_types)

    results = {}

    for plot_type in plot_types:
        print(f"\nAttempting to generate {plot_type} plot...")
        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")

                # Special handling for forecast plot
                if plot_type == 'forecast' and data is not None:
                    # Last row of every non-target column is passed as X
                    exog_cols = [col for col in data.columns if col != target]
                    forecast_data = data[exog_cols].iloc[[-1]]
                    predictions = predict_model(model, X=forecast_data)
                    print(predictions)  # Print predictions for debugging

                    # Put the forecast on the same kind of index as the history.
                    # The previous code relabelled every row with a synthetic
                    # month-end date_range, which shifted the dates whenever
                    # `data` was not strictly one row per month.
                    forecast = predictions
                    if isinstance(data.index, pd.DatetimeIndex) and isinstance(predictions.index, pd.PeriodIndex):
                        forecast = predictions.copy()
                        forecast.index = predictions.index.to_timestamp(how='end').normalize()

                    # Combine actual data and predictions, keeping the real dates
                    concat_df = pd.concat([data[[target]], forecast], axis=0)

                    # Plot the data using plotly
                    fig = px.line(concat_df, x=concat_df.index, y=[target, "y_pred"], template='plotly_dark')
                    fig.show()
                    success_suffix = " (using predict_model)"
                else:
                    plot_model(model, plot=plot_type)
                    # Ensure plot is displayed
                    plt.show()
                    success_suffix = ""

                results[plot_type] = {
                    'status': 'success',
                    'warnings': [str(warn.message) for warn in w]
                }
                print(f"✓ Successfully generated {plot_type} plot{success_suffix}")

                # Print any warnings that occurred
                if w:
                    print("Warnings generated:")
                    for warning in w:
                        print(f"  - {warning.message}")

        except Exception as e:
            # Deliberate best-effort: record the failure and move on to the next plot
            results[plot_type] = {
                'status': 'failed',
                'error': str(e)
            }
            print(f"✗ Failed to generate {plot_type} plot")
            print(f"  Error: {str(e)}")

        # Clear the current plot to prevent overlapping
        plt.close('all')

    # Print summary
    print("\nPlot Generation Summary:")
    print("=" * 50)
    successful = sum(1 for r in results.values() if r['status'] == 'success')
    print(f"Successfully generated: {successful}/{len(plot_types)} plots")

    # Return results dictionary for further analysis if needed
    return results

# Run the plot generation and report any plots that failed.
# NOTE(review): `best_model_loaded` is not defined in any cell visible in this
# notebook — confirm it is created (e.g. loaded from the saved pickle) before
# this cell runs on a fresh kernel.
print("Starting plot generation...")
try:
    # `final_df` is passed so the forecast plot can be built from predict_model
    plot_results = generate_pycaret_plots(model=best_model_loaded, data=final_df)

    # Keep only the entries whose generation failed
    failed_plots = {
        name: outcome
        for name, outcome in plot_results.items()
        if outcome['status'] == 'failed'
    }
    if failed_plots:
        print("\nDetailed error information for failed plots:")
        for plot_type, result in failed_plots.items():
            print(f"\n{plot_type}:", f"Error: {result['error']}", sep="\n")

except Exception as e:
    print(f"An unexpected error occurred: {str(e)}")

Starting plot generation...

Attempting to generate ts plot...


✓ Successfully generated ts plot

Attempting to generate train_test_split plot...


✓ Successfully generated train_test_split plot

Attempting to generate residuals plot...
In sample predictions has not been implemented for this estimator of type 'ForecastingPipeline' in `sktime`. When this is implemented, it will be enabled by default in pycaret.
In sample predictions has not been implemented for this estimator of type 'ForecastingPipeline' in `sktime`. When this is implemented, it will be enabled by default in pycaret.
✓ Successfully generated residuals plot

Attempting to generate forecast plot...
          y_pred
2025-02  97.7611


✓ Successfully generated forecast plot (using predict_model)

Attempting to generate diagnostics plot...
In sample predictions has not been implemented for this estimator of type 'ForecastingPipeline' in `sktime`. When this is implemented, it will be enabled by default in pycaret.
In sample predictions has not been implemented for this estimator of type 'ForecastingPipeline' in `sktime`. When this is implemented, it will be enabled by default in pycaret.
✓ Successfully generated diagnostics plot

Attempting to generate decomp plot...
In sample predictions has not been implemented for this estimator of type 'ForecastingPipeline' in `sktime`. When this is implemented, it will be enabled by default in pycaret.
In sample predictions has not been implemented for this estimator of type 'ForecastingPipeline' in `sktime`. When this is implemented, it will be enabled by default in pycaret.
✓ Successfully generated decomp plot

Attempting to generate acf plot...
In sample predictions has not bee