In [94]:
from xbbg import blp
import vectorbt as vbt
import numpy as np
import pandas as pd
import datetime
import plotly.express as px
from xbbg import blp
import os
import quantstats as qs
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Import custom modules with an alias
import bloomberg_data as bd
import transformations as tr


In [95]:
# Getting the data from the bloomberg_data module 
tickers = ['.MIDERCAD U Index', '.CADIG F Index', 'VIX Index','.HYUSER U Index','.IGUSER U Index']
fields = [['PX_LAST'], ['PX_LAST'], ['PX_LAST'],['PX_LAST'], ['PX_LAST']]
start_date = '2000-01-01'
end_date = '2025-12-31'
column_names = [['cad_ig_er_index'], ['cad_ig_sprds'], ['vix'], ['us_hy_er_index'], ['us_ig_er_index']]
frequencies = ['D', 'D', 'D','D','D']  # You can edit the frequency for each ticker here

dataframes = []

for ticker, field, col_name, freq in zip(tickers, fields, column_names, frequencies):
    df = bd.get_single_ticker_data(ticker, field, start_date, end_date, freq=freq, column_names=col_name)
    dataframes.append(df)

# Getting risk-free index
rate_df = bd.get_single_ticker_data('GCAN3M Index', ['PX_LAST'], start_date, end_date)
risk_free_idx = tr.risk_free_index(rate_df,col_name="risk_free")  # Ensure the default col_name is applied

# Merge all dataframes including the risk-free index
merged_data = bd.merge_dataframes(dataframes)
merged_data = bd.merge_dataframes([merged_data, risk_free_idx])

# Print the final merged data and its information
print(merged_data)
print('----------------------------------------------------------------')
print('----------------------------------------------------------------')
print(merged_data.info())

# Rename
data= merged_data

2024-06-30 12:28:47,897 - INFO - Successfully retrieved data for ticker: .MIDERCAD U Index
2024-06-30 12:28:48,551 - INFO - Successfully retrieved data for ticker: .CADIG F Index
2024-06-30 12:28:48,923 - INFO - Successfully retrieved data for ticker: VIX Index
2024-06-30 12:28:49,459 - INFO - Successfully retrieved data for ticker: .HYUSER U Index
2024-06-30 12:28:50,399 - INFO - Successfully retrieved data for ticker: .IGUSER U Index
2024-06-30 12:28:50,765 - INFO - Successfully retrieved data for ticker: GCAN3M Index
2024-06-30 12:28:50,780 - INFO - Merged 5 dataframes using inner method.
2024-06-30 12:28:50,782 - INFO - Merged 2 dataframes using inner method.


            cad_ig_er_index  cad_ig_sprds    vix  us_hy_er_index  \
2002-11-29           1.0143       69.8153  27.50          0.4183   
2002-12-31           1.0146       77.3398  28.62          0.4134   
2003-01-31           1.0155       74.8880  31.17          0.4285   
2003-02-28           1.0159      106.9295  29.63          0.4265   
2003-03-31           1.0142      117.3892  29.15          0.4406   
...                     ...           ...    ...             ...   
2024-06-24           1.3922      120.6286  13.33          1.1202   
2024-06-25           1.3924      120.6766  12.84          1.1198   
2024-06-26           1.3935      120.6997  12.55          1.1236   
2024-06-27           1.3933      120.9534  12.24          1.1215   
2024-06-28           1.3925      120.2679  12.44          1.1270   

            us_ig_er_index  risk_free_index  
2002-11-29          1.0150       112.289960  
2002-12-31          1.0195       112.557157  
2003-01-31          1.0269       112.816244  

In [96]:
# Reset the index to move the date from the index to a column
data_reset = data.reset_index()
data_reset.rename(columns={'index': 'Date'}, inplace=True)

# Ensure the Outputs directory exists
output_dir = 'Outputs'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file in the Outputs folder
output_file_path = os.path.join(output_dir, 'daily_credit_ml.csv')
data_reset.to_csv(output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")


DataFrame saved to Outputs\daily_credit_ml.csv


In [97]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2033 entries, 2002-11-29 to 2024-06-28
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cad_ig_er_index  2033 non-null   float64
 1   cad_ig_sprds     2033 non-null   float64
 2   vix              2033 non-null   float64
 3   us_hy_er_index   2033 non-null   float64
 4   us_ig_er_index   2033 non-null   float64
 5   risk_free_index  2033 non-null   float64
dtypes: float64(6)
memory usage: 111.2 KB
None


In [99]:
import pandas as pd
import numpy as np
import vectorbt as vbt

# 1. Data Preparation
close_data = data[['cad_ig_er_index', 'us_ig_er_index', 'us_hy_er_index', 'risk_free_index']]
close_data.index = pd.to_datetime(close_data.index)
close_data = close_data.sort_index()

# 2. Signal Generation (Modified to use boolean operations)
def generate_signal(close, lookback, return_threshold, vol_threshold):
    us_ig_returns = close['us_ig_er_index'].pct_change(lookback)
    us_hy_returns = close['us_hy_er_index'].pct_change(lookback)
    
    # Calculate 20-period lookback volatility
    us_ig_vol = close['us_ig_er_index'].pct_change().rolling(lookback).std()
    us_hy_vol = close['us_hy_er_index'].pct_change().rolling(lookback).std()
    
    # Generate signal based on returns and volatility
    signal = ((us_ig_returns > return_threshold) & (us_hy_returns > return_threshold) & 
              (us_ig_vol < vol_threshold) & (us_hy_vol < vol_threshold))
    
    # Ensure the signal is boolean
    signal = signal.fillna(False)
    
    # Debug: Print signal statistics
    print(f"Signal sum: {signal.sum()}")
    print(f"Signal mean: {signal.mean()}")
    print(f"Number of trade entries: {(signal.astype(int).diff() == 1).sum()}")
    
    return signal

# 3. Strategy Implementation
lookback = 20
return_threshold = 0.01
vol_threshold = 0.02  # Example threshold for volatility, you can adjust this

signal = generate_signal(close_data, lookback, return_threshold, vol_threshold)

# Ensure signal is boolean
signal = signal.astype(bool)

# 4. Portfolio Creation (Modified to use boolean operations)
portfolio = vbt.Portfolio.from_signals(
    close=close_data['cad_ig_er_index'],
    entries=signal.shift(1).fillna(False),  # Enter the day after the signal
    exits=signal.shift(1).fillna(False) & (~signal).fillna(False),  # Exit when signal changes from True to False
    init_cash=100000,
    fees=0,
    freq='D'
)

# 5. Generate and print portfolio stats
try:
    stats = portfolio.stats()
    print("\nPortfolio Statistics:")
    print(stats)
except Exception as e:
    print(f"An error occurred while generating portfolio stats: {str(e)}")

# 6. Benchmark Creation (Buy-and-Hold)
benchmark = vbt.Portfolio.from_holding(
    close_data['cad_ig_er_index'],
    init_cash=100000,
    freq='D'
)

# 7. Generate and print benchmark stats
try:
    benchmark_stats = benchmark.stats()
    print("\nBuy-and-Hold Benchmark Statistics:")
    print(benchmark_stats)
except Exception as e:
    print(f"An error occurred while generating benchmark stats: {str(e)}")

# Print start and end dates for the initial strategy
print("\nInitial Strategy Date Range:")
print(f"Start Date: {portfolio.wrapper.index[0]}")
print(f"End Date: {portfolio.wrapper.index[-1]}")

# Print start and end dates for the benchmark
print("\nBenchmark Date Range:")
print(f"Start Date: {benchmark.wrapper.index[0]}")
print(f"End Date: {benchmark.wrapper.index[-1]}")

# Print the date range of the original data
print("\nOriginal Data Date Range:")
print(f"Start Date: {close_data.index[0]}")
print(f"End Date: {close_data.index[-1]}")

# Check if all strategies cover the same date range
if (portfolio.wrapper.index[0] == benchmark.wrapper.index[0] == close_data.index[0] and
    portfolio.wrapper.index[-1] == benchmark.wrapper.index[-1] == close_data.index[-1]):
    print("\nAll strategies and data cover the same date range.")
else:
    print("\nWarning: Date ranges may not be consistent across all strategies and data.")
    
# Print the total number of trading days
total_days = len(close_data)
print(f"\nTotal number of trading days: {total_days}")

# Improved Strategy Comparison Print
print("\nStrategy Comparison:")
print(f"{'Strategy':<15} {'Total Return':<15} {'Sharpe Ratio':<15} {'Max Drawdown':<15} {'Start Date':<20} {'End Date':<20} {'Variable Name':<15}")
print("-" * 115)

# Function to format date
def format_date(date):
    return date.strftime('%Y-%m-%d')

# Assuming you have stats and benchmark_stats from previous calculations
print(f"{'Initial':<15} {stats['Total Return [%]']:15.2f} {stats.get('Sharpe Ratio', 'N/A'):15.2f} {stats['Max Drawdown [%]']:15.2f} {format_date(portfolio.wrapper.index[0]):<20} {format_date(portfolio.wrapper.index[-1]):<20} {'portfolio':<15}")
print(f"{'Benchmark':<15} {benchmark_stats['Total Return [%]']:15.2f} {benchmark_stats.get('Sharpe Ratio', 'N/A'):15.2f} {benchmark_stats['Max Drawdown [%]']:15.2f} {format_date(benchmark.wrapper.index[0]):<20} {format_date(benchmark.wrapper.index[-1]):<20} {'benchmark':<15}")

# Print date range check
if (portfolio.wrapper.index[0] == benchmark.wrapper.index[0] and
    portfolio.wrapper.index[-1] == benchmark.wrapper.index[-1]):
    print("\nAll strategies cover the same date range.")
else:
    print("\nWarning: Date ranges are not consistent across all strategies.")

# Print total number of trading days
total_days = len(close_data)
print(f"\nTotal number of trading days: {total_days}")


Signal sum: 226
Signal mean: 0.11116576487948844
Number of trade entries: 39

Portfolio Statistics:
Start                         2002-11-29 00:00:00
End                           2024-06-28 00:00:00
Period                         2033 days 00:00:00
Start Value                              100000.0
End Value                           133714.230843
Total Return [%]                        33.714231
Benchmark Return [%]                    37.286799
Max Gross Exposure [%]                      100.0
Total Fees Paid                               0.0
Max Drawdown [%]                        20.853258
Max Drawdown Duration           296 days 00:00:00
Total Trades                                    1
Total Closed Trades                             0
Total Open Trades                               1
Open Trade PnL                       33714.230843
Win Rate [%]                                  NaN
Best Trade [%]                                NaN
Worst Trade [%]                               NaN


In [82]:
# ML Methods

# 1. Data Preparation
close_data = data[['cad_ig_er_index', 'us_ig_er_index', 'us_hy_er_index', 'risk_free_index']]
close_data.index = pd.to_datetime(close_data.index)
close_data = close_data.sort_index()

# Feature Engineering
def create_features(close_data, lookback):
    features = pd.DataFrame(index=close_data.index)
    features['us_ig_returns'] = close_data['us_ig_er_index'].pct_change(lookback)
    features['us_hy_returns'] = close_data['us_hy_er_index'].pct_change(lookback)
    features['us_ig_volatility'] = close_data['us_ig_er_index'].pct_change().rolling(window=lookback).std()
    features['us_hy_volatility'] = close_data['us_hy_er_index'].pct_change().rolling(window=lookback).std()
    features['target'] = (close_data['cad_ig_er_index'].pct_change(lookback).shift(-lookback) > 0).astype(int)
    features = features.dropna()
    return features

lookback = 20
features = create_features(close_data, lookback)

# Data Preprocessing
X = features.drop(columns=['target'])
y = features['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluation
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

# Signal Generation
signals = model.predict(X)
features['signal'] = signals

# Align signals with the original data
signals_series = pd.Series(index=features.index, data=features['signal'])




Training Accuracy: 1.0
Testing Accuracy: 0.6823821339950372

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.46      0.42        99
           1       0.81      0.75      0.78       304

    accuracy                           0.68       403
   macro avg       0.60      0.61      0.60       403
weighted avg       0.71      0.68      0.69       403



## Model Performance Summary

### Training Accuracy
- **Training Accuracy: 1.0**: This indicates that the model perfectly predicted the target variable on the training dataset. While this may seem ideal, it is often a sign of overfitting, meaning the model has learned the training data too well, including noise and outliers, which may not generalize well to new, unseen data.

### Testing Accuracy
- **Testing Accuracy: 0.6824**: This shows that the model correctly predicted the target variable approximately 68.2% of the time on the testing dataset. This performance is significantly lower than the training accuracy, reinforcing the possibility that the model has overfitted to the training data.

### Classification Report
The classification report provides a detailed breakdown of the model's performance for each class (0 and 1) on the testing dataset. The key metrics include precision, recall, f1-score, and support.

#### Class 0 (No Signal)
- **Precision: 0.38**: Of all the instances the model predicted as class 0, only 38% were actually class 0. This low precision indicates a high false positive rate.
- **Recall: 0.46**: Of all the actual class 0 instances, the model correctly identified 46%. This means the model missed 54% of the actual class 0 instances (high false negative rate).
- **F1-Score: 0.42**: The harmonic mean of precision and recall, indicating a balance between the two. A low f1-score suggests the model struggles to correctly classify class 0.
- **Support: 99**: The number of actual instances of class 0 in the testing dataset.

#### Class 1 (Signal)
- **Precision: 0.81**: Of all the instances the model predicted as class 1, 81% were actually class 1. This high precision indicates a low false positive rate.
- **Recall: 0.75**: Of all the actual class 1 instances, the model correctly identified 75%. This means the model missed 25% of the actual class 1 instances.
- **F1-Score: 0.78**: The harmonic mean of precision and recall, suggesting the model performs well in identifying class 1.
- **Support: 304**: The number of actual instances of class 1 in the testing dataset.

#### Overall Metrics
- **Accuracy: 0.68**: The overall proportion of correctly classified instances in the testing dataset.
- **Macro Average**: The average of precision, recall, and f1-score for both classes, giving equal weight to each class.
  - **Precision: 0.60**
  - **Recall: 0.61**
  - **F1-Score: 0.60**
- **Weighted Average**: The average of precision, recall, and f1-score, weighted by the number of instances in each class. This gives more importance to the performance on the larger class (class 1 in this case).
  - **Precision: 0.71**
  - **Recall: 0.68**
  - **F1-Score: 0.69**

### Interpretation
- **Overfitting**: The model performs perfectly on the training data but considerably worse on the testing data, indicating it has overfitted the training data and may not generalize well.
- **Class Imbalance**: The support values show there are more instances of class 1 (304) than class 0 (99) in the testing dataset. This imbalance can affect the performance metrics and might require techniques such as resampling, class weighting, or using specialized algorithms to address it.
- **Model Performance**: The model has a relatively high precision and recall for class 1 but performs poorly on class 0. This suggests the model is better at predicting the presence of a signal (class 1) than the absence of it (class 0).

### Next Steps
To improve the model and address the identified issues:
1. **Address Overfitting**: Use techniques like cross-validation, pruning the decision trees, or using fewer estimators.
2. **Handle Class Imbalance**: Implement resampling techniques like SMOTE or adjust the class weights in the Random Forest model.
3. **Feature Engineering**: Explore additional features that might help the model distinguish between the classes better.
4. **Model Tuning**: Perform hyperparameter tuning to find the optimal settings for the Random Forest classifier.
5. **Experiment with Other Models**: Try other classification algorithms such as Gradient Boosting, SVM, or neural networks to see if they perform better.


In [84]:
import pandas as pd
import numpy as np
import vectorbt as vbt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 1. Data Preparation
close_data = data[['cad_ig_er_index', 'us_ig_er_index', 'us_hy_er_index', 'risk_free_index']]
close_data.index = pd.to_datetime(close_data.index)
close_data = close_data.sort_index()

# Feature Engineering: Adding volatility
lookback_vol = 20
close_data['us_ig_vol'] = close_data['us_ig_er_index'].pct_change().rolling(lookback_vol).std()
close_data['us_hy_vol'] = close_data['us_hy_er_index'].pct_change().rolling(lookback_vol).std()
close_data = close_data.dropna()

# Creating the feature set and target variable
X = close_data[['us_ig_er_index', 'us_hy_er_index', 'us_ig_vol', 'us_hy_vol']]
y = (close_data['cad_ig_er_index'].pct_change(1).shift(-1) > 0).astype(int)  # Binary target for next day return
y = y.dropna()
X = X.loc[y.index]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train_res, y_train_res)

# Best parameters for Random Forest
best_rf = grid_search_rf.best_estimator_

# Evaluate the Random Forest model
y_pred_train_rf = best_rf.predict(X_train_res)
y_pred_test_rf = best_rf.predict(X_test)

print("Random Forest Training Accuracy:", best_rf.score(X_train_res, y_train_res))
print("Random Forest Testing Accuracy:", best_rf.score(X_test, y_test))
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_test_rf))

# Experiment with other models
# Gradient Boosting
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

gb = GradientBoostingClassifier(random_state=42)
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train_res, y_train_res)

# Best parameters for Gradient Boosting
best_gb = grid_search_gb.best_estimator_

# Evaluate the Gradient Boosting model
y_pred_train_gb = best_gb.predict(X_train_res)
y_pred_test_gb = best_gb.predict(X_test)

print("Gradient Boosting Training Accuracy:", best_gb.score(X_train_res, y_train_res))
print("Gradient Boosting Testing Accuracy:", best_gb.score(X_test, y_test))
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_test_gb))

# Support Vector Machine
param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

svc = SVC(random_state=42)
grid_search_svc = GridSearchCV(svc, param_grid_svc, cv=5, scoring='accuracy')
grid_search_svc.fit(X_train_res, y_train_res)

# Best parameters for SVC
best_svc = grid_search_svc.best_estimator_

# Evaluate the SVC model
y_pred_train_svc = best_svc.predict(X_train_res)
y_pred_test_svc = best_svc.predict(X_test)

print("SVC Training Accuracy:", best_svc.score(X_train_res, y_train_res))
print("SVC Testing Accuracy:", best_svc.score(X_test, y_test))
print("\nSVC Classification Report:")
print(classification_report(y_test, y_pred_test_svc))


Random Forest Training Accuracy: 0.9426136363636364
Random Forest Testing Accuracy: 0.5732009925558312

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.55      0.53       177
           1       0.63      0.59      0.61       226

    accuracy                           0.57       403
   macro avg       0.57      0.57      0.57       403
weighted avg       0.58      0.57      0.57       403

Gradient Boosting Training Accuracy: 0.9982954545454545
Gradient Boosting Testing Accuracy: 0.5781637717121588

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.55      0.54       177
           1       0.63      0.60      0.61       226

    accuracy                           0.58       403
   macro avg       0.57      0.58      0.57       403
weighted avg       0.58      0.58      0.58       403

SVC Training Accuracy: 0.5039772727272728
SVC Testin

### Model Performance Analysis

#### Random Forest
- **Training Accuracy:** 94.26%
  - The Random Forest model performs very well on the training data, correctly predicting the target variable 94.26% of the time.
- **Testing Accuracy:** 57.32%
  - This lower accuracy on the test data suggests that the model might be overfitting, capturing noise in the training data that doesn't generalize well to new, unseen data.

**Classification Report:**
- **Class 0 (Not Investing):**
  - **Precision:** 0.51 (51% of the predicted non-investing signals are correct)
  - **Recall:** 0.55 (55% of actual non-investing signals are correctly identified)
  - **F1-score:** 0.53 (harmonic mean of precision and recall)
- **Class 1 (Investing):**
  - **Precision:** 0.63 (63% of the predicted investing signals are correct)
  - **Recall:** 0.59 (59% of actual investing signals are correctly identified)
  - **F1-score:** 0.61

#### Gradient Boosting
- **Training Accuracy:** 99.83%
  - The Gradient Boosting model performs extremely well on the training data, which is an indicator of potential overfitting.
- **Testing Accuracy:** 57.82%
  - Slightly better than Random Forest on test data but still indicates overfitting.

**Classification Report:**
- **Class 0 (Not Investing):**
  - **Precision:** 0.52 (52% of the predicted non-investing signals are correct)
  - **Recall:** 0.55 (55% of actual non-investing signals are correctly identified)
  - **F1-score:** 0.54
- **Class 1 (Investing):**
  - **Precision:** 0.63 (63% of the predicted investing signals are correct)
  - **Recall:** 0.60 (60% of actual investing signals are correctly identified)
  - **F1-score:** 0.61

#### Support Vector Classifier (SVC)
- **Training Accuracy:** 50.40%
  - The SVC model struggles to learn from the training data, likely due to the imbalance in the dataset or inappropriate hyperparameters.
- **Testing Accuracy:** 50.12%
  - The SVC model performs slightly better than random guessing on the test data.

**Classification Report:**
- **Class 0 (Not Investing):**
  - **Precision:** 0.44 (44% of the predicted non-investing signals are correct)
  - **Recall:** 0.51 (51% of actual non-investing signals are correctly identified)
  - **F1-score:** 0.47
- **Class 1 (Investing):**
  - **Precision:** 0.56 (56% of the predicted investing signals are correct)
  - **Recall:** 0.50 (50% of actual investing signals are correctly identified)
  - **F1-score:** 0.53


#### Analysis
Overfitting: Both the Random Forest and Gradient Boosting classifiers exhibit high training accuracy but much lower testing accuracy, indicating overfitting. The models are capturing noise in the training data that doesn’t generalize well.
Performance: The testing accuracies for all models are relatively low (around 50-58%), indicating that the models are not performing well on unseen data. This could be due to:
Imbalanced classes
Insufficient or inappropriate features
Need for better hyperparameter tuning
Model Selection: Given the performance, further steps can include:
Balancing the dataset more effectively
Feature engineering to add more relevant features
Trying different models or ensemble methods
Fine-tuning hyperparameters with more extensive grid search or random search


#### Next Steps
Balance the Dataset: Explore more advanced techniques for balancing the dataset, such as ensemble methods specifically designed for imbalanced data.
Feature Engineering: Add more features that might capture the underlying patterns better, such as moving averages, other technical indicators, etc.
Hyperparameter Tuning: Use more sophisticated hyperparameter tuning methods like Random Search or Bayesian Optimization.
Model Evaluation: Implement cross-validation to better evaluate model performance and ensure that it generalizes well to unseen data.

In [87]:
import pandas as pd
import numpy as np
import vectorbt as vbt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE

# Assuming close_data and signal are already defined
# Add the moving average features
close_data['us_ig_ma20'] = close_data['us_ig_er_index'].rolling(window=20).mean()
close_data['us_hy_ma20'] = close_data['us_hy_er_index'].rolling(window=20).mean()

# Drop NaN values and align X and y
X = close_data[['us_ig_er_index', 'us_hy_er_index', 'us_ig_ma20', 'us_hy_ma20']].dropna()
y = signal.loc[X.index]

# Ensure X and y have the same index
X = X.loc[y.index]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balance the Dataset using Balanced Random Forest Classifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
brf.fit(X_train, y_train)

brf_pred_train = brf.predict(X_train)
brf_pred_test = brf.predict(X_test)

print("Balanced Random Forest Training Accuracy:", brf.score(X_train, y_train))
print("Balanced Random Forest Testing Accuracy:", brf.score(X_test, y_test))

print("\nBalanced Random Forest Classification Report:")
print(classification_report(y_test, brf_pred_test))

# Hyperparameter Tuning using Random Search
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_dist, n_iter=100, cv=5, verbose=2, n_jobs=-1)
random_search_rf.fit(X_train, y_train)

print("Best parameters found for Random Forest:", random_search_rf.best_params_)
print("Best cross-validation score for Random Forest:", random_search_rf.best_score_)

# Model Evaluation using Cross-Validation
rf = RandomForestClassifier(**random_search_rf.best_params_)
cv_scores = cross_val_score(rf, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

# Retrain the model on the entire training set
rf.fit(X_train, y_train)
rf_pred_train = rf.predict(X_train)
rf_pred_test = rf.predict(X_test)

print("Random Forest Training Accuracy:", rf.score(X_train, y_train))
print("Random Forest Testing Accuracy:", rf.score(X_test, y_test))

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred_test))




Balanced Random Forest Training Accuracy: 0.9460815047021943
Balanced Random Forest Testing Accuracy: 0.9448621553884712

Balanced Random Forest Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.95      0.97       379
        True       0.47      0.90      0.62        20

    accuracy                           0.94       399
   macro avg       0.73      0.92      0.80       399
weighted avg       0.97      0.94      0.95       399

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found for Random Forest: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}
Best cross-validation score for Random Forest: 0.9868338557993731
Cross-validation scores: [0.98746082 0.98432602 0.99059561 0.98119122 0.99059561]
Mean cross-validation score: 0.9868338557993731
Random Forest Training Accuracy: 1.0
Random Forest Testing Accuracy: 0.9774436090225563

Random Forest Classification R

# Detailed Explanation of Results and Suggestions for Further Steps

## Balanced Random Forest Results

### Training Accuracy: 0.946
- The model performs very well on the training data with an accuracy of 94.6%.

### Testing Accuracy: 0.945
- The model generalizes well to the test data, achieving a high accuracy of 94.5%.

### Classification Report

#### False (Class 0)
- **Precision:** 0.99 - When the model predicts False, it is correct 99% of the time.
- **Recall:** 0.95 - The model correctly identifies 95% of all False instances.
- **F1-Score:** 0.97 - The harmonic mean of precision and recall, indicating excellent performance for False predictions.

#### True (Class 1)
- **Precision:** 0.47 - When the model predicts True, it is correct 47% of the time.
- **Recall:** 0.90 - The model correctly identifies 90% of all True instances.
- **F1-Score:** 0.62 - The harmonic mean of precision and recall, showing that while the model often catches True instances, it also has many false positives.

### Overall
- **Accuracy:** 0.94 - The overall accuracy across both classes is 94%.
- **Macro Avg:** 0.73 precision, 0.92 recall, 0.80 f1-score - These are unweighted averages, providing an overall measure of performance.
- **Weighted Avg:** 0.97 precision, 0.94 recall, 0.95 f1-score - These are weighted averages, taking into account the imbalance in the dataset.

## Random Forest Results

### Best Parameters from Random Search
- **n_estimators:** 200
- **min_samples_split:** 2
- **min_samples_leaf:** 1
- **max_depth:** 20

### Cross-Validation Score: 0.987
- The mean cross-validation score is 98.7%, indicating that the model performs very well on different subsets of the training data.

### Training Accuracy: 1.0
- The model perfectly fits the training data, with an accuracy of 100%.

### Testing Accuracy: 0.977
- The model generalizes extremely well to the test data, achieving an accuracy of 97.7%.

### Classification Report

#### False (Class 0)
- **Precision:** 0.98 - When the model predicts False, it is correct 98% of the time.
- **Recall:** 0.99 - The model correctly identifies 99% of all False instances.
- **F1-Score:** 0.99 - The harmonic mean of precision and recall, indicating excellent performance for False predictions.

#### True (Class 1)
- **Precision:** 0.82 - When the model predicts True, it is correct 82% of the time.
- **Recall:** 0.70 - The model correctly identifies 70% of all True instances.
- **F1-Score:** 0.76 - The harmonic mean of precision and recall, indicating good performance but with room for improvement.

### Overall
- **Accuracy:** 0.98 - The overall accuracy across both classes is 97.7%.
- **Macro Avg:** 0.90 precision, 0.85 recall, 0.87 f1-score - These are unweighted averages, providing an overall measure of performance.
- **Weighted Avg:** 0.98 precision, 0.98 recall, 0.98 f1-score - These are weighted averages, taking into account the imbalance in the dataset.

## Analysis and Next Steps

### Imbalance Handling
- The Balanced Random Forest classifier improves recall for the minority class (True), but precision is relatively low. This suggests the model is identifying most True instances but also generating false positives.
- Consider further techniques like SMOTE for oversampling the minority class or using ensemble methods that handle imbalance more effectively.

### Feature Engineering
- Explore adding more features, such as other technical indicators (e.g., RSI, MACD) or fundamental indicators (e.g., financial ratios).
- Feature selection techniques can help identify the most important features.

### Hyperparameter Tuning
- The Random Search has provided good results. For further improvement, consider using Bayesian Optimization, which can be more efficient and provide better hyperparameter values.
- Explore other models like XGBoost, LightGBM, or CatBoost, which often perform well with tabular data.

### Model Evaluation
- Implement cross-validation more extensively to ensure the model generalizes well.
- Evaluate the model's performance over different time periods to ensure stability and robustness.

- **Evaluate feature importance** to understand which features contribute most to the model's predictions. This can guide further feature engineering and selection.

### Implementation of Advanced Techniques
- **Model Ensembling**: Combine predictions from multiple models to improve overall performance. Techniques like stacking, bagging, and boosting can be useful.
- **Anomaly Detection**: Identify and treat outliers in the dataset, which can significantly impact model performance.
- **Temporal Cross-Validation**: Since financial data is time-series based, use techniques like walk-forward validation to ensure the model is robust over different time periods.

### Monitoring and Maintenance
- **Performance Monitoring**: Continuously monitor the model’s performance in a live environment to detect any degradation over time.
- **Periodic Retraining**: Financial markets change, so regularly retrain the model with new data to maintain performance.
- **Backtesting**: Simulate the model’s performance on historical data to understand potential returns and risks.

### Conclusion
Both the Balanced Random Forest and the tuned Random Forest models show promising results, with the latter achieving slightly better performance metrics. However, each has its strengths and weaknesses:
- The Balanced Random Forest is better at detecting True instances but generates more false positives.
- The tuned Random Forest offers higher precision and overall accuracy but might overfit the training data.

Combining these approaches and further refining the models through advanced techniques and thorough evaluation will help in building a robust predictive model for financial data.

### Next Steps
1. **Imbalance Handling**: Experiment with different techniques like SMOTE, ADASYN, or ensemble methods specifically designed for imbalanced data.
2. **Feature Engineering**: Add and test new features, and use techniques like PCA or Lasso for feature selection.
3. **Hyperparameter Tuning**: Use Bayesian Optimization or Genetic Algorithms for more efficient hyperparameter tuning.
4. **Model Evaluation**: Implement temporal cross-validation and conduct extensive backtesting to ensure model stability.
5. **Model Ensembling**: Explore stacking, bagging, and boosting to combine multiple models for improved performance.
6. **Deployment**: Develop a framework for continuous monitoring and periodic retraining to adapt to new data and market conditions.

By following these steps, you can build a highly accurate and robust predictive model suitable for financial trading strategies.


In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Assuming close_data and signal are already defined

# Step 1: Calculate volatility
close_data['volatility_ig'] = close_data['us_ig_er_index'].pct_change().rolling(window=20).std()
close_data['volatility_hy'] = close_data['us_hy_er_index'].pct_change().rolling(window=20).std()

# Drop NaN values that result from rolling calculations
close_data = close_data.dropna()

# Ensure 'cad_ig_er_index' is included in the features DataFrame
X = close_data[['us_ig_er_index', 'us_hy_er_index', 'volatility_ig', 'volatility_hy', 'cad_ig_er_index']]
y = signal.loc[close_data.index]  # Ensure the signal matches the filtered data

# Step 2: Imbalance Handling with SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Step 3: Feature Engineering
def add_features(data):
    # RSI calculation
    delta = data['cad_ig_er_index'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    data['rsi'] = 100 - (100 / (1 + rs))
    
    # MACD calculation
    ema_12 = data['cad_ig_er_index'].ewm(span=12, adjust=False).mean()
    ema_26 = data['cad_ig_er_index'].ewm(span=26, adjust=False).mean()
    data['macd'] = ema_12 - ema_26
    data['macd_signal'] = data['macd'].ewm(span=9, adjust=False).mean()
    data['macd_diff'] = data['macd'] - data['macd_signal']
    
    # SMA calculation
    data['sma'] = data['cad_ig_er_index'].rolling(window=20).mean()
    
    return data

X_res = add_features(X_res)

# Step 4: Hyperparameter Tuning with Bayesian Optimization
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces={
        'learning_rate': Real(0.01, 1.0, prior='log-uniform'),
        'max_depth': Integer(1, 30),
        'n_estimators': Integer(50, 500),
        'subsample': Real(0.1, 1.0, prior='uniform'),
        'colsample_bytree': Real(0.1, 1.0, prior='uniform'),
    },
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42
)

# Fit Bayesian Optimization
bayes_search.fit(X_res, y_res)

# Best parameters
print(f"Best parameters found: {bayes_search.best_params_}")
print(f"Best cross-validation score: {bayes_search.best_score_}")

# Step 5: Model Evaluation with Cross-Validation
best_xgb_model = bayes_search.best_estimator_
cross_val_scores = cross_val_score(best_xgb_model, X_res, y_res, cv=5)
print(f"Cross-validation scores: {cross_val_scores}")
print(f"Mean cross-validation score: {cross_val_scores.mean()}")

# Final model fitting
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
best_xgb_model.fit(X_train, y_train)

# Model performance on test set
y_pred = best_xgb_model.predict(X_test)
print("\nXGBoost Model Performance:")
print(f"Training Accuracy: {best_xgb_model.score(X_train, y_train)}")
print(f"Testing Accuracy: {best_xgb_model.score(X_test, y_test)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))




Best parameters found: OrderedDict([('colsample_bytree', 0.1), ('learning_rate', 0.01), ('max_depth', 30), ('n_estimators', 500), ('subsample', 0.6691555385512675)])
Best cross-validation score: 0.9574103323228602
Cross-validation scores: [0.92420213 0.99201065 0.93608522 0.94806924 0.98668442]
Mean cross-validation score: 0.9574103323228602

XGBoost Model Performance:
Training Accuracy: 0.9946737683089214
Testing Accuracy: 0.9813829787234043

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.97      0.98       397
        True       0.96      1.00      0.98       355

    accuracy                           0.98       752
   macro avg       0.98      0.98      0.98       752
weighted avg       0.98      0.98      0.98       752



In [102]:
# Now proceed with backtesting using vectorbt

# 1. Signal Generation using trained model
X_test_orig = add_features(X)  # Add features to the original dataset
y_pred_signal = best_xgb_model.predict(X_test_orig)

# Ensure the signal is boolean
signal_ml = pd.Series(y_pred_signal, index=X_test_orig.index).astype(bool)

# Reindex the signals to match the close_data index
signal_ml = signal_ml.reindex(close_data.index, method='ffill').fillna(False)

# 2. Portfolio Creation using the machine learning signal
portfolio_ml = vbt.Portfolio.from_signals(
    close=close_data['cad_ig_er_index'],
    entries=signal_ml.shift(1).fillna(False),  # Enter the day after the signal
    exits=signal_ml.shift(1).fillna(False) & (~signal_ml).fillna(False),  # Exit when signal changes from True to False
    init_cash=100000,
    fees=0,
    freq='D'
)

# 3. Generate and print portfolio stats for the ML-based strategy
try:
    stats_ml = portfolio_ml.stats()
    print("\nMachine Learning Strategy Portfolio Statistics:")
    print(stats_ml)
except Exception as e:
    print(f"An error occurred while generating ML portfolio stats: {str(e)}")




Machine Learning Strategy Portfolio Statistics:
Start                         2002-11-29 00:00:00
End                           2024-06-28 00:00:00
Period                         2033 days 00:00:00
Start Value                              100000.0
End Value                           122698.035069
Total Return [%]                        22.698035
Benchmark Return [%]                    37.286799
Max Gross Exposure [%]                      100.0
Total Fees Paid                               0.0
Max Drawdown [%]                        10.598642
Max Drawdown Duration           296 days 00:00:00
Total Trades                                    1
Total Closed Trades                             0
Total Open Trades                               1
Open Trade PnL                       22698.035069
Win Rate [%]                                  NaN
Best Trade [%]                                NaN
Worst Trade [%]                               NaN
Avg Winning Trade [%]                         NaN
A