# XGBoost with Cross Validation and PCA

In [None]:
import numpy as np
import pandas as pd
from numba import njit
import vectorbtpro as vbt
vbt.settings.set_theme("dark")
vbt.settings.plotting["layout"]["width"] = 800
vbt.settings.plotting['layout']['height'] = 200
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, LogisticRegression
from sklearn.svm import SVR, SVC
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42) # random forest classifier
from joblib import dump, load


### Modeling
The class Splitter can also be helpful in cross-validating ML models. In particular, you can casually step upon a class SKLSplitter that acts as a regular cross-validator from scikit-learn by subclassing BaseCrossValidator. We'll demonstrate its usage on a simple classification problem of predicting the best entry and exit timings.

Before we start, we need to decide on features and labels that should act as predictor and response variables respectively. Features are usually multi-columnar time-series DataFrames where each row contains multiple data points (one per column) that should predict the same row in labels. Labels are usually a single-columnar time-series Series that should be predicted. Ask yourself the following questions to easily come up with a decision:

"How can the future performance be represented, preferably as a single number? Should it be the price at the next bar, the average price change over the next week, a vector of weights for rebalancing, a boolean containing a signal, or something else?"
"What kind of data that encompasses the past performance is likely to predict the future performance? Should it be indicators, news sentiment index, past backtesting results, or something else?"
"Which ML model can handle such a task?" (remember that most models are limited to just a couple of specific feature and label formats!)
For the sake of an example, we'll fit a random forest classifier on all TA-Lib indicators stacked along columns to predict the binary labels generated by the label generator TRENDLB, where 1 means an uptrend and 0 means a downtrend. Sounds like fun 😌

Build a pipeline to impute and (standard-)normalize the data, [reduce the dimensionality](https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html) of the features, as well as fit one of the [linear](https://scikit-learn.org/stable/modules/linear_model.html) models to predict the average price change over the next n bars (i.e., regression task!). Based on each prediction, you can then decide whether a position is worth opening or closing out. 

# Let's experiment

Smaller resolution but using dollar bars

# Helper functions
Create dollar bars and add them to the original df

In [None]:

def dollar_bar_func(ohlc_df, dollar_bar_size):
    """Aggregate time-based OHLCV rows into dollar bars.

    Rows are consumed in order while the traded dollar value
    (``Close * Volume``) is accumulated. The row on which the running total
    reaches ``dollar_bar_size`` closes the current bar and is part of it; the
    following row opens the next bar. A trailing run of rows that never
    reaches the threshold is discarded.

    Args:
        ohlc_df: DataFrame with at least ``Open``, ``High``, ``Low``,
            ``Close`` and ``Volume`` columns. It is not modified.
        dollar_bar_size: Dollar value that has to be traded to complete a bar.

    Returns:
        DataFrame with one row per completed bar and a fresh integer index.
        Columns are ``Open``, ``High``, ``Low``, ``Close``, the summed volume
        columns that exist in ``ohlc_df`` and, when a time source is available
        (a ``DatetimeIndex`` or an ``Open Time`` column), ``Open Time`` and
        ``Close Time``. The frame is empty when no bar could be completed.
    """
    # Only the summable columns actually present in the input are aggregated.
    summed_columns = [
        col
        for col in (
            'Volume',
            'Quote volume',
            'Trade count',
            'Taker base volume',
            'Taker quote volume',
        )
        if col in ohlc_df.columns
    ]

    # Computed on the side so the caller's frame is left untouched.
    dollar_values = (ohlc_df['Close'] * ohlc_df['Volume']).to_numpy()

    # bar_edges[k]:bar_edges[k + 1] is the half-open row range of bar k.
    bar_edges = [0]
    running_value = 0.0
    for row, value in enumerate(dollar_values):
        running_value += value
        if running_value >= dollar_bar_size:
            bar_edges.append(row + 1)  # the crossing row belongs to this bar
            running_value = 0.0

    if isinstance(ohlc_df.index, pd.DatetimeIndex):
        open_times = ohlc_df.index
    elif 'Open Time' in ohlc_df.columns:
        open_times = pd.Index(ohlc_df['Open Time'])
    else:
        open_times = None

    one_ms = pd.Timedelta(milliseconds=1)
    n_rows = len(ohlc_df)
    bars = []
    for start, end in zip(bar_edges[:-1], bar_edges[1:]):
        rows = ohlc_df.iloc[start:end]
        bar = {
            'Open': rows['Open'].iloc[0],
            'High': rows['High'].max(),
            'Low': rows['Low'].min(),
            'Close': rows['Close'].iloc[-1],
        }
        for col in summed_columns:
            bar[col] = rows[col].sum()

        if open_times is not None:
            bar['Open Time'] = open_times[start]
            if end < n_rows:
                # The bar ends one millisecond before the next row opens.
                bar['Close Time'] = open_times[end] - one_ms
            else:
                # No following row exists, so fall back to the timestamp of
                # the last row in the bar.
                bar['Close Time'] = open_times[end - 1]

        bars.append(bar)

    columns = ['Open', 'High', 'Low', 'Close', *summed_columns]
    if open_times is not None:
        columns += ['Open Time', 'Close Time']

    # Passing the columns explicitly keeps the schema even with zero bars.
    return pd.DataFrame(bars, columns=columns)

# Create a simple function to simplify the number so we can use it in our column names
def simplify_number(num):
    """
    Shorten a large number to its integer part plus a magnitude suffix.

    The value is divided by 1000 until it drops below 1000 or the largest
    suffix ('B') is reached; the remaining fraction is truncated.
    simplify_number(1000) -> 1K
    """
    units = ('', 'K', 'M', 'B')
    scaled = num
    tier = 0

    # One possible division per non-empty suffix.
    for _ in units[1:]:
        if abs(scaled) < 1000:
            break
        scaled = scaled / 1000.0
        tier += 1

    return str(int(scaled)) + units[tier]

def merge_and_fill_dollar_bars(original_df, dollar_bars_df, dollar_bar_size):
    """Left-join dollar bars onto the original frame and forward-fill them.

    Args:
        original_df: Frame whose index is matched against the bars' open times.
        dollar_bars_df: Dollar bars with at least ``Open Time`` and ``Close``
            columns. It is not modified.
        dollar_bar_size: Bar size; only used to build the column prefix
            ``db_<simplified size>_``.

    Returns:
        ``original_df`` plus the prefixed dollar-bar columns and a boolean
        ``<prefix>NewDBFlag`` column that is True on rows where a bar opens.
        Every column except the flag is forward-filled, which includes the
        columns that came from ``original_df``.
    """
    prefix = f'db_{simplify_number(dollar_bar_size)}_'
    flag_col = prefix + 'NewDBFlag'

    # add_prefix returns a new frame, so re-indexing it leaves the input alone.
    bars = dollar_bars_df.add_prefix(prefix)
    bars.index = pd.to_datetime(bars[prefix + 'Open Time'])

    merged_df = original_df.merge(bars, how='left', left_index=True, right_index=True)

    # Must be computed before the forward fill, while rows without a bar are
    # still NaN. The result is already a pure boolean column.
    merged_df[flag_col] = merged_df[prefix + 'Close'].notna()

    # .ffill() replaces the deprecated fillna(method='ffill').
    fill_cols = [col for col in merged_df.columns if col != flag_col]
    merged_df[fill_cols] = merged_df[fill_cols].ffill()

    return merged_df





# Calculate Dollar Bars
Calc Dollar bars and then add technical analysis features

Uncomment this section if you want to run different size dollar bars

In [None]:
# dollar_bar_size = 90_000_000
# btc_dollar_bars = dollar_bar_func(futures_1m.get(), dollar_bar_size=dollar_bar_size)
# btc_dollar_bars.index = pd.to_datetime(btc_dollar_bars['Open Time'])
# btc_dollar_bars.shape

In [None]:
# Convert the dataframe back into a vbt data object
# btc_90M_db_vbt = vbt.BinanceData.from_data(btc_dollar_bars)


In [None]:
# Save the dollarbars to a pickle file
# btc_90M_db_vbt.save('btc_90M_db_vbt.pkl')

# Load the dollar bars from pickle file

In [None]:
# Load the pre-computed 90M dollar bars from disk.
# NOTE(review): this is a pickle file, so only load it from a trusted source.
btc_90M_db_vbt = vbt.BinanceData.load('data/btc_90M_db_vbt.pkl')

Take a small slice of the data for train/testing and leave some to be out of sample

In [None]:
# In-sample window used for training and cross-validation.
# Label-based date slices include both endpoints, so the window stops on
# 2022-12-31 to keep every 2023 bar exclusively in the hold-out set.
data = btc_90M_db_vbt['2021-01-01':'2022-12-31']
# Hold-out window that the cross-validated model never sees.
outofsample_data = btc_90M_db_vbt['2023-01-01':'2023-06-03']
print(data.shape)
print(outofsample_data.shape)

# Generate features for the model
Note: I originally tried adding all of the TA-Lib indicators with default params to the model. This was pretty good on daily data, but on dollar bars it seemed to confuse the model. Below we simply add the binary trend feature and the calendar features.

In [None]:
# import talib
# print(talib.get_function_groups())
# # Not sure how to call just a single indicator group so I'll just call all of them
# vbt.IF.list_indicators("psar")

In [None]:

n = 150 # number of bars ahead used for the prediction target

# Features (X) start from the raw columns of the dollar-bar data.
# The TA-Lib based alternative is kept here for reference:
# X = data.run("talib", periods=vbt.run_func_dict(mavp=n))
X = data.get()
# Add a trend flag as a feature: 1 where the close is above pivot_info.conf_value, else 0.
# The thresholds used here are up_th=0.30 (30% up move) and down_th=0.05 (5% down move).
# Read more about the pivotinfo below in the next couple cells
pivot_info = data.run("pivotinfo", up_th=.30, down_th=0.05)
binary_pivot_labels = np.where(data.close > pivot_info.conf_value,1,0) # Create binary labels for pivot points
X['trend'] = binary_pivot_labels # add pivot label as a feature
# Add calendar features derived from the bar timestamps
X['dayofmonth'] = X.index.day
X['month'] = X.index.month
X['year'] = X.index.year
X['hour'] = X.index.hour
X['minute'] = X.index.minute
X['dayofweek'] = X.index.dayofweek
X['dayofyear'] = X.index.dayofyear

# Target: the n-bar-ahead price change, smoothed with a trailing rolling mean over n rows.
# The last n rows have no future price, and the first n-1 rows have an incomplete window, so they are NaN.
y = (data.close.shift(-n) / data.close - 1).rolling(n).mean() # future price change we use rolling mean to smooth the data

# Preprocessing steps to handle NaNs
X = X.replace([-np.inf, np.inf], np.nan) # replace inf with nan
invalid_column_mask = X.isnull().all(axis=0) | (X.nunique() == 1) # drop columns that are all nan or have only one unique value
X = X.loc[:, ~invalid_column_mask] # drop invalid columns
invalid_row_mask = X.isnull().any(axis=1) | y.isnull() # drop rows that have nan in any column or in y

# Drop invalid rows in X and y (the same mask keeps them aligned)
X = X.loc[~invalid_row_mask]
y = y.loc[~invalid_row_mask]
# Drop the datetime columns; only numeric features go into the model
X = X.drop(['Open Time','Close Time'], axis=1)

Previously I used TRENDLB with great results, but unfortunately it has a look-ahead bias. No wonder the results were so good.

## A replacement for TRENDLB - PIVOTINFO
VBT has a pivotinfo method that avoids lookahead bias. Below we plot the pivot points and we create a similar binary version like trendlb

IMPORTANT: Clear the plot outputs before pushing to Github

In [None]:

fig = data.plot(plot_volume=False)
# TRENDLB labels with 0.20 / 0.05 thresholds, shown only for comparison
trendwithlookahead = data.run('trendlb', 0.20, 0.05, mode='binary').labels
trendwithlookahead.rename('trendlb', inplace=True)
# NOTE: this overwrites pivot_info / binary_pivot_labels using up_th=.20,
# whereas the feature cell above built the 'trend' feature with up_th=.30
pivot_info = data.run("pivotinfo", up_th=.20, down_th=0.05)
binary_pivot_labels = np.where(data.close > pivot_info.conf_value,1,0) # Create binary labels for pivot points
print('Trend with look ahead bias')
data.close.vbt.overlay_with_heatmap(trendwithlookahead).show() # plot the trend labels
print('Using pivot points no look ahead bias')
data.close.vbt.overlay_with_heatmap(binary_pivot_labels).show() # plot the pivot labels
print('Here are the actual pivot points')
# Draw the pivots on top of the price figure created at the start of the cell
pivot_info.plot(fig=fig, conf_value_trace_kwargs=dict(visible=False))
fig.show()

In [None]:
# `fig` is re-created later by the equity-curve cell, so dropping it here is safe
del fig # delete the figure to free up memory

### We now need to tinker with these params
`up_th` and `down_th` need tuning to see if we can find the optimal params, but for now I'm just going to use the same ones that I used for trendlb.

In [None]:

# Preprocessing + regression pipeline.
# Other estimators that can be swapped in as the 'model' step:
#   Ridge(), LinearRegression(), LogisticRegression(), Lasso(), ElasticNet(), SVR()
steps = [
    ('imputation', SimpleImputer(strategy='mean')),  # fill missing values with the column mean
    ('scaler', StandardScaler()),  # standardize every feature
    ('pca', PCA(n_components=15)),  # keep 15 principal components
    ('model', XGBRegressor(objective='reg:squarederror')),  # XGBoost regressor with squared-error objective
]
pipeline = Pipeline(steps=steps)

# Cross-validator built with the splitter's "from_expanding" method; every
# split is labelled into a "train" and a "test" set.
splitter_options = dict(
    min_length=600,
    offset=200,
    split=-200,
    set_labels=["train", "test"],
)
cv = vbt.SKLSplitter("from_expanding", **splitter_options)

# Concrete splitter for X, holding the index ranges of every split
cv_splitter = cv.get_splitter(X)
# Plot the cross-validation splits
# cv_splitter.plot().show_svg()


Drop the datetime columns: the ML model doesn't like them and prefers numbers.

In [None]:
# Inspect the remaining feature columns ('Open Time' / 'Close Time' were dropped in the feature cell)
X.columns

Run some predictions. NOTE: if you want to skip ahead to save time on your machine, just scroll down to where we load the model from disk. I've already trained a model.

In [None]:
# # Use your pipeline to compress features and fit the model for predictions
# print(f'Pipeline Steps :{pipeline.steps}')
# pipeline.fit(X, y)  # Fit the pipeline on the entire dataset    
# print(f'Pipeline Score :{pipeline.score(X, y)}')  # Score the pipeline on the entire dataset of training data
# scores = cross_val_score(pipeline, X, y, cv=cv, scoring="r2", n_jobs=-1, verbose=100) # how well the model generalizes to unseen data
# average_score = np.mean(scores)
# print(f'Average cross-validation score: {average_score}')


## Run our cross validation
Again to skip ahead just scroll down to load the model

In [None]:


# Walk-forward predictions: fit on each split's train set, predict its test set
X_slices = cv_splitter.take(X)
y_slices = cv_splitter.take(y)

fold_labels = []
fold_preds = []
for split in X_slices.index.unique(level="split"):
    X_train_slice, y_train_slice = X_slices[(split, "train")], y_slices[(split, "train")]
    X_test_slice, y_test_slice = X_slices[(split, "test")], y_slices[(split, "test")]
    # fit() returns the pipeline itself, so `slice_pipeline` is `pipeline` refitted in place
    slice_pipeline = pipeline.fit(X_train_slice, y_train_slice)
    test_pred = pd.Series(slice_pipeline.predict(X_test_slice), index=y_test_slice.index)
    fold_labels.append(y_test_slice)
    fold_preds.append(test_pred)
    print(f"Split {split} R-squared: {r2_score(y_test_slice, test_pred)}")

# Stitch the per-split test sets into one series each
test_labels = pd.concat(fold_labels).rename("labels")
test_preds = pd.concat(fold_preds).rename("preds")

# Error metrics over all stitched test predictions
mse = mean_squared_error(test_labels, test_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_labels, test_preds)
r2 = r2_score(test_labels, test_preds)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

# Optional: show the predictions as a heatmap over the price
# data.close.vbt.overlay_with_heatmap(test_preds).show_svg()

### Save the model
This saves only the pipeline fitted on the last cross-validation split. Skip the cell below unless you trained a model and want to overwrite the saved file.

In [None]:

# Persist the pipeline as fitted on the last cross-validation split.
# NOTE: this overwrites any existing file at this path.
filename = 'models/model_upto_2023_with_pivot.joblib'
dump(slice_pipeline, filename)

# Load the model from storage

In [None]:
filename = 'models/model_upto_2023_with_pivot.joblib'
# Load the model from the .joblib file
# NOTE(review): joblib files are pickles, so only load models from a trusted source.
final_pipeline = load(filename)

# Predict on the full in-sample feature set (returns an array aligned with the rows of X)
insample_predictions = final_pipeline.predict(X)

# R-squared over all of X / y.
# NOTE(review): presumably most of these rows were part of the model's training data, so this is not a generalization score.
r2 = r2_score(y, insample_predictions)

print(f"R-squared on the entire dataset: {r2}")


In [None]:
# Scatter the cross-validation test predictions against their actual labels
# (test_labels are the actuals, test_preds are the predictions)
plt.scatter(test_labels, test_preds, alpha=0.2)
# Least-squares line through the same points, drawn over the same x-values it was fitted on
m, b = np.polyfit(test_labels, test_preds, 1)
plt.plot(test_labels, m*test_labels + b, color='red')

# Add the formula for the slope and intercept
plt.text(0.05, 0.95, f"y = {m:.2f}x + {b:.2f}", transform=plt.gca().transAxes)

# Add the y and x axis labels
plt.xlabel("Actuals")
plt.ylabel("Predictions")

Trained and test data

In [None]:
# Scatter the loaded model's predictions against the actuals over the full in-sample set
# (y is the actuals, insample_predictions is the predictions)
plt.scatter(y, insample_predictions, alpha=0.2)
# Add a line of best fit
m, b = np.polyfit(y, insample_predictions, 1)
plt.plot(y, m*y + b, color='red')

# Add the formula for the slope and intercept
plt.text(0.05, 0.95, f"y = {m:.2f}x + {b:.2f}", transform=plt.gca().transAxes)

# Add the y and x axis labels
plt.xlabel("Actuals")
plt.ylabel("Predictions")

# Test it on out of sample data
The model was never trained on 2023 data.

In [None]:

n = 150 # number of bars ahead used for the prediction target

# Out-of-sample features start from the raw columns of the dollar-bar data.
# The TA-Lib based alternative is kept here for reference:
# X = data.run("talib", periods=vbt.run_func_dict(mavp=n))
Xoos = outofsample_data.get()
# psar_vbt = outofsample_data.run("pandas_ta:PSAR", append=True, acceleration=0.02, maximum=0.2) # I didn't end up using this
# Trend flag: 1 where the close is above pivot_info.conf_value, else 0
# (same up_th / down_th as the in-sample feature cell; see the PIVOTINFO section above)
pivot_info = outofsample_data.run("pivotinfo", up_th=.30, down_th=0.05)
binary_pivot_labels = np.where(outofsample_data.close > pivot_info.conf_value,1,0) # Create binary labels for pivot points
Xoos['trend'] = binary_pivot_labels # add pivot label as a feature
# X['psar_cross'] = psar_vbt.psarr
# Drop the datetime columns; only numeric features go into the model
Xoos = Xoos.drop(['Open Time','Close Time'], axis=1)
# Add calendar features derived from the bar timestamps
Xoos['dayofmonth']  = Xoos.index.day
Xoos['month']       = Xoos.index.month
Xoos['year']        = Xoos.index.year
Xoos['hour']        = Xoos.index.hour
Xoos['minute']      = Xoos.index.minute
Xoos['dayofweek']   = Xoos.index.dayofweek
Xoos['dayofyear']   = Xoos.index.dayofyear

# Target: the n-bar-ahead price change, smoothed with a trailing rolling mean over n rows
yoos = (outofsample_data.close.shift(-n) / outofsample_data.close - 1).rolling(n).mean() # future price change we use rolling mean to smooth the data

# Preprocessing steps to handle NaNs
Xoos = Xoos.replace([-np.inf, np.inf], np.nan) # replace inf with nan
# Unlike the in-sample cell, constant columns are kept so the feature set matches the one the model was fitted on
invalid_column_mask = Xoos.isnull().all(axis=0) #| (Xoos.nunique() == 1) # removed the second condition because `year` column is always the same for 2023
Xoos = Xoos.loc[:, ~invalid_column_mask] # drop invalid columns
invalid_row_mask = Xoos.isnull().any(axis=1) | yoos.isnull() # drop rows that have nan in any column or in y

# Drop invalid rows in X and y (the same mask keeps them aligned)
Xoos = Xoos.loc[~invalid_row_mask]
yoos = yoos.loc[~invalid_row_mask]

In [None]:
# Sanity check: labels and features must have the same number of rows
print(yoos.shape)
print(Xoos.shape)

### Test the model on data it has never seen

In [None]:
# Predict on the out-of-sample features with the model loaded from disk (no refitting here)
outofsample_predictions = final_pipeline.predict(Xoos)

# R-squared of those predictions against the out-of-sample labels
r2 = r2_score(yoos, outofsample_predictions)

print(f"R-squared on the out of sample dataset: {r2}")

### Create a scatterplot of the predictions vs the actuals

In [None]:
import matplotlib.pyplot as plt

# Scatter the out-of-sample predictions against the actuals
# (yoos is the actuals and outofsample_predictions is the predictions)
plt.scatter(yoos, outofsample_predictions)
# Add a line of best fit
m, b = np.polyfit(yoos, outofsample_predictions, 1)
plt.plot(yoos, m*yoos + b, color='red')

# Add the formula for the slope and intercept
plt.text(0.05, 0.95, f"y = {m:.2f}x + {b:.2f}", transform=plt.gca().transAxes)

# Add the y and x axis labels
plt.xlabel("Actuals")
plt.ylabel("Predictions")


In [None]:
# Wrap the prediction array in a Series indexed like yoos so it can be aligned with prices later
outofsample_predictions = pd.Series(outofsample_predictions, index=yoos.index)
outofsample_predictions = outofsample_predictions.rename("outofsample_predictions")
outofsample_predictions

# Emulate a production scenario on out of sample data
## Retrain the model every 200 bars

In [None]:
# Load the model
# Preprocess the data
# Create Cross Validations for training and testing on newly seen data
# Train the model
# Make predictions
# Test and evaluate the model



In [None]:
# Same load-and-score steps as the earlier "Load the model from storage" cell
filename = 'models/model_upto_2023_with_pivot.joblib'
# Load the model from the .joblib file
# NOTE(review): joblib files are pickles, so only load models from a trusted source.
final_pipeline = load(filename)

# Predict on the full in-sample feature set (returns an array aligned with the rows of X)
insample_predictions = final_pipeline.predict(X)

# R-squared over all of X / y
r2 = r2_score(y, insample_predictions)

print(f"R-squared on the entire dataset: {r2}")

In [None]:

# Same steps as the earlier out-of-sample feature cell; rebuilds Xoos / yoos from scratch
n = 150 # number of bars ahead used for the prediction target

# Out-of-sample features start from the raw columns of the dollar-bar data.
# The TA-Lib based alternative is kept here for reference:
# X = data.run("talib", periods=vbt.run_func_dict(mavp=n))
Xoos = outofsample_data.get()
# psar_vbt = outofsample_data.run("pandas_ta:PSAR", append=True, acceleration=0.02, maximum=0.2) # I didn't end up using this
# Trend flag: 1 where the close is above pivot_info.conf_value, else 0
# (same up_th / down_th as the in-sample feature cell; see the PIVOTINFO section above)
pivot_info = outofsample_data.run("pivotinfo", up_th=.30, down_th=0.05)
binary_pivot_labels = np.where(outofsample_data.close > pivot_info.conf_value,1,0) # Create binary labels for pivot points
Xoos['trend'] = binary_pivot_labels # add pivot label as a feature
# X['psar_cross'] = psar_vbt.psarr
# Drop the datetime columns; only numeric features go into the model
Xoos = Xoos.drop(['Open Time','Close Time'], axis=1)
# Add calendar features derived from the bar timestamps
Xoos['dayofmonth']  = Xoos.index.day
Xoos['month']       = Xoos.index.month
Xoos['year']        = Xoos.index.year
Xoos['hour']        = Xoos.index.hour
Xoos['minute']      = Xoos.index.minute
Xoos['dayofweek']   = Xoos.index.dayofweek
Xoos['dayofyear']   = Xoos.index.dayofyear

# Target: the n-bar-ahead price change, smoothed with a trailing rolling mean over n rows
yoos = (outofsample_data.close.shift(-n) / outofsample_data.close - 1).rolling(n).mean() # future price change we use rolling mean to smooth the data

# Preprocessing steps to handle NaNs
Xoos = Xoos.replace([-np.inf, np.inf], np.nan) # replace inf with nan
# Constant columns are kept here (unlike the in-sample cell), see the trailing note on the next line
invalid_column_mask = Xoos.isnull().all(axis=0) #| (Xoos.nunique() == 1) # removed the second condition because `year` column is always the same for 2023
Xoos = Xoos.loc[:, ~invalid_column_mask] # drop invalid columns
invalid_row_mask = Xoos.isnull().any(axis=1) | yoos.isnull() # drop rows that have nan in any column or in y

# Drop invalid rows in X and y (the same mask keeps them aligned)
Xoos = Xoos.loc[~invalid_row_mask]
yoos = yoos.loc[~invalid_row_mask]

In [None]:

# Cross-validator for the out-of-sample data, built with the same settings
# as the in-sample one
cv = vbt.SKLSplitter(
    "from_expanding",
    min_length=600,
    offset=200,
    split=-200,
    set_labels=["train", "test"]
)

# Everything produced here lives under oos_-prefixed names so that the
# in-sample `cv_splitter`, `X_slices`, `y_slices`, `test_labels` and
# `test_preds` stay intact for the cells that still rely on them.
oos_cv_splitter = cv.get_splitter(Xoos)

oos_X_slices = oos_cv_splitter.take(Xoos)
oos_y_slices = oos_cv_splitter.take(yoos)

oos_fold_labels = []
oos_fold_preds = []
for split in oos_X_slices.index.unique(level="split"):
    X_train_slice = oos_X_slices[(split, "train")]
    y_train_slice = oos_y_slices[(split, "train")]
    X_test_slice = oos_X_slices[(split, "test")]
    y_test_slice = oos_y_slices[(split, "test")]
    # Refit the shared pipeline on this split's train set before predicting its test set
    slice_pipeline = pipeline.fit(X_train_slice, y_train_slice)
    test_pred = slice_pipeline.predict(X_test_slice)
    test_pred = pd.Series(test_pred, index=y_test_slice.index)
    oos_fold_labels.append(y_test_slice)
    oos_fold_preds.append(test_pred)
    print(f"Split {split} R-squared: {r2_score(y_test_slice, test_pred)}")

# Stitch the per-split test sets into one series each
oos_test_labels = pd.concat(oos_fold_labels).rename("labels")
oos_test_preds = pd.concat(oos_fold_preds).rename("preds")

# Error metrics over the stitched out-of-sample test predictions
mse = mean_squared_error(oos_test_labels, oos_test_preds)
rmse = np.sqrt(mse)  # or use mean_squared_error with squared=False
mae = mean_absolute_error(oos_test_labels, oos_test_preds)
r2 = r2_score(oos_test_labels, oos_test_preds)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

# Visualize the predictions as a heatmap plotted against the price
# outofsample_data.close.vbt.overlay_with_heatmap(oos_test_preds).show_svg()

### Simulate a portfolio in 2023 with retraining the model every 200 bars

In [None]:
# Long/short portfolio driven by the retrained out-of-sample predictions
oos_retraining_pf = vbt.Portfolio.from_signals(
    outofsample_data.close[oos_test_preds.index], # use only the test set
    entries         = oos_test_preds > 0.05, # enter long when the prediction is above 0.05
    exits           = oos_test_preds < 0.00, # exit long when the prediction turns negative
    short_entries   = oos_test_preds < -0.04, # enter short when the prediction is below -0.04
    short_exits     = oos_test_preds > 0.0, # exit short when the prediction turns positive
    # direction="both" # long and short
)
print(oos_retraining_pf.stats())


In [None]:
# Plot the retrained out-of-sample portfolio
oos_retraining_pf.plot().show()

Show that same model without retraining the model every 200 dollar bars

In [None]:
# Same thresholds as the retrained strategy, but driven by the predictions of
# the model loaded from disk (no refitting on 2023 data).
# `oos_pf` is also needed by the "save everything" cell further down.
oos_pf = vbt.Portfolio.from_signals(
    outofsample_data.close[outofsample_predictions.index], # use only rows that have a prediction
    entries         = outofsample_predictions > 0.05, # enter long when the prediction is above 0.05
    exits           = outofsample_predictions < 0.00, # exit long when the prediction turns negative
    short_entries   = outofsample_predictions < -0.04, # enter short when the prediction is below -0.04
    short_exits     = outofsample_predictions > 0.0, # exit short when the prediction turns positive
    # direction="both" # long and short
)
print(oos_pf.stats())
oos_pf.plot().show()


let's compare it to a long only strategy

In [None]:
# For comparison run a buy and hold strategy over the rows covered by `outofsample_predictions`
# NOTE(review): `oos_test_preds` (used by the retrained portfolio) covers a different set of rows, so that comparison is not over the same period
buy_and_hold = vbt.Portfolio.from_holding(outofsample_data.close[outofsample_predictions.index])
print(f'Total return: {buy_and_hold.total_return}')
print(f'Max Drawdown: {buy_and_hold.max_drawdown}')


👆 better drawdowns than a buy and hold, and almost the same results.

# Look at the Portfolio on Test Data

and Simulate a portfolio?

In [None]:
# Long/short portfolio driven by the in-sample cross-validation test predictions.
# Requires `test_preds` to be the concatenated Series produced by the in-sample CV cell.
insample_pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index],  # use only the test set
    entries         = test_preds > 0.05,  # enter long when the prediction is above 0.05
    exits           = test_preds < 0.00,  # exit long when the prediction turns negative
    short_entries   = test_preds < -0.04,  # enter short when the prediction is below -0.04
    short_exits     = test_preds > 0.0,  # exit short when the prediction turns positive
    # direction="both" # long and short
)
print(insample_pf.stats())

# pf.plot().show_svg()
# Show first period
# pf['2018':'2021'].plot().show_svg()
# Show second period
# pf['2021':'2023'].plot().show_svg()


In [None]:
# Plot the in-sample (CV test set) portfolio
insample_pf.plot().show()

In [None]:
# In-sample equity curve (from CV test predictions, not from fitted training rows)
fig = insample_pf.cumulative_returns.vbt.plot(trace_kwargs=dict(name='Insample'))
# Continue the curve with the compounded out-of-sample returns, starting from the last in-sample value.
# .iloc is used because integer keys on a datetime-indexed Series must be positional explicitly.
# NOTE(review): this chaining assumes cumulative_returns is a growth factor that starts at 1 - confirm.
oos = insample_pf.cumulative_returns.iloc[-1] * (1 + oos_retraining_pf.returns).cumprod()
# Add the out of sample equity curve to the plot
oos.vbt.plot(fig=fig, trace_kwargs=dict(name='Out of Sample'))
# Prices rebased to 1 at the first bar of each window, for reference
normalized_price = data.close / data.close.iloc[0]
oos_normalized_price = outofsample_data.close / outofsample_data.close.iloc[0]
normalized_price.rename('Normalized Price').vbt.plot(fig=fig)
oos_normalized_price.rename('Out of Sample Normalized Price').vbt.plot(fig=fig)
# The gap is the warmup period for the new model to start making predictions

In [None]:
# Human-readable trade records of the in-sample portfolio
trades = insample_pf.trades.records_readable

## Save everything to the models folder for later analysis

In [None]:
# In-sample artifacts
insample_pf.save('models/insample_test_portfolio.pkl')
insample_pf.stats().to_csv('models/insample_stats_test.csv')
insample_pf.trades.records_readable.to_csv('models/insample_trades_test.csv')
X.to_csv('models/insample_X_test.csv')
y.to_csv('models/insample_y_test.csv')
# predict() returns a plain array, so wrap it in a Series aligned with X before writing
pd.Series(insample_predictions, index=X.index, name='insample_predictions').to_csv('models/insample_predictions_test.csv')

# Out-of-sample inputs and predictions
Xoos.to_csv('models/oos_X_test.csv')
yoos.to_csv('models/oos_y_test.csv')
outofsample_predictions.to_csv('models/oos_predictions_test.csv')

# Saved last: `oos_pf` must have been created by the non-retrained
# out-of-sample portfolio cell, and a failure here should not block the saves above.
oos_pf.save('models/oos_test_portfolio.pkl')
oos_pf.stats().to_csv('models/oos_stats_test.csv')
oos_pf.trades.records_readable.to_csv('models/oos_trades_test.csv')


Save the out of sample portfolio that was retrained every 200 bars

In [None]:
# Artifacts of the out-of-sample portfolio that was retrained on every split
oos_retraining_pf.save('models/oos_retrained_portfolio.pkl')
oos_retraining_pf.stats().to_csv('models/oos_retrained_stats.csv')
oos_retraining_pf.trades.records_readable.to_csv('models/oos_retrained_trades.csv')
# The retrained predictions are stored in `oos_test_preds` (the series this portfolio was built from)
oos_test_preds.to_csv('models/oos_retrained_predictions.csv')



# Explore which features are impacting the model

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

# The fitted XGBRegressor is the last step of the most recently fitted pipeline
fitted_model = pipeline.named_steps['model']

# Importance scores as exposed by the scikit-learn wrapper
importance = fitted_model.feature_importances_

# Summarize feature importance by position
for i, j in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,j))

# Plot feature importance
# NOTE(review): plot_importance presumably uses a different importance type
# (split counts) than feature_importances_, which would explain why the two
# sets of numbers differ - confirm against the xgboost docs.
plot_importance(fitted_model)
plt.show()

# Default to the original column names
feature_names = X.columns.tolist()

# With PCA in the pipeline the model only sees principal components, so the
# inputs are named PC1..PCn instead of the original feature names
if 'pca' in pipeline.named_steps:
    n_components = pipeline.named_steps['pca'].n_components_
    feature_names = [f'PC{i+1}' for i in range(n_components)]

# Print feature importance with names; `score` keeps the `importance` array intact
for name, score in zip(feature_names, importance):
    print(f'Feature: {name}, Score: {score}')


A lot to unpack above. Why are the feature scores so different from the F-scores of the features?

# Hyperparameter Tuning

### Grid Search Method
#### DONT RUN WITHOUT GPU

In [None]:
from sklearn.model_selection import GridSearchCV

# Specify hyperparameters to tune and their respective ranges.
# The 'model__' prefix routes each parameter to the pipeline's 'model' step.
# Six parameters with three values each give 3**6 = 729 combinations, each fitted once per CV split.
param_grid = {
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__n_estimators': [100, 500, 1000],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 5, 10],
    'model__subsample': [0.5, 0.7, 1.0],
    'model__colsample_bytree': [0.5, 0.7, 1.0]
    # add other parameters here
}

# Perform grid search over the whole pipeline, scored by R-squared on the splits of `cv`
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring="r2", n_jobs=-1, verbose=10)
grid_search.fit(X, y)

# Best parameters and score from grid search
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")


### Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Specify hyperparameters to tune and their respective distributions.
# scipy's uniform(loc, scale) samples from [loc, loc + scale]; randint(low, high) excludes `high`.
param_dist = {
    'model__learning_rate': uniform(0.01, 0.2),  # 0.01 .. 0.21
    'model__n_estimators': randint(100, 1000),  # 100 .. 999
    'model__max_depth': randint(3, 10),  # 3 .. 9
    'model__min_child_weight': randint(1, 10),  # 1 .. 9
    'model__subsample': uniform(0.5, 0.5),  # 0.5 .. 1.0
    'model__colsample_bytree': uniform(0.5, 0.5),  # 0.5 .. 1.0
    # add other parameters here
}

# Perform randomized search: 10 sampled parameter sets, scored by R-squared on the splits of `cv`
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=10, cv=cv, scoring="r2", n_jobs=-1, verbose=10, random_state=42)
random_search.fit(X, y)

# Best parameters and score from random search
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")


In [None]:
# Fit and predict with the best estimator.
# The search was fitted on the in-sample X / y and the results are plotted
# against the in-sample prices, so the slices are rebuilt from X / y here
# instead of reusing whatever `X_slices` / `y_slices` currently hold.
cv_splitter = cv.get_splitter(X)
X_slices = cv_splitter.take(X)
y_slices = cv_splitter.take(y)

test_labels = []
test_preds = []
for split in X_slices.index.unique(level="split"):
    X_train_slice = X_slices[(split, "train")]
    y_train_slice = y_slices[(split, "train")]
    X_test_slice = X_slices[(split, "test")]
    y_test_slice = y_slices[(split, "test")]

    # Refits the tuned pipeline in place on this split's train set
    slice_pipeline = random_search.best_estimator_.fit(X_train_slice, y_train_slice)
    test_pred = slice_pipeline.predict(X_test_slice)
    test_pred = pd.Series(test_pred, index=y_test_slice.index)
    test_labels.append(y_test_slice)
    test_preds.append(test_pred)

# Stitch the per-split test sets into one series each
test_labels = pd.concat(test_labels).rename("labels")
test_preds = pd.concat(test_preds).rename("preds")

# Error metrics over all stitched test predictions
mse = mean_squared_error(test_labels, test_preds)
rmse = np.sqrt(mse)  # or use mean_squared_error with squared=False
mae = mean_absolute_error(test_labels, test_preds)
r2 = r2_score(test_labels, test_preds)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

# Visualize the predictions as a heatmap plotted against the price
data.close.vbt.overlay_with_heatmap(test_preds).show_svg()


In [None]:
# Long-only portfolio driven by the tuned model's test predictions
pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index], # use only the test set
    test_preds > 0.05, # enter long when the prediction is above 0.05
    test_preds < 0.02, # exit long when the prediction falls below 0.02
    direction="LongOnly" # long positions only
)
print(pf.stats())
pf.plot().show_svg()


In [None]:
# Human-readable trade records of the tuned-model portfolio
pf.trades.records_readable