# XGBoost with Cross Validation and PCA

In [2]:
import numpy as np
import pandas as pd
from numba import njit
import vectorbtpro as vbt
vbt.settings.set_theme("dark")
vbt.settings.plotting["layout"]["width"] = 800
vbt.settings.plotting['layout']['height'] = 200
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from collections import Counter
import pandas_ta as ta

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, LogisticRegression
from sklearn.svm import SVR, SVC
from xgboost import XGBRegressor, XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42) # random forest classifier
from joblib import dump, load
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt


### Modeling
The class Splitter can also be helpful in cross-validating ML models. In particular, there is a class SKLSplitter that acts as a regular scikit-learn cross-validator by subclassing BaseCrossValidator. We'll demonstrate its usage on a simple classification problem of predicting the best entry and exit timings.

Before we start, we need to decide on features and labels that should act as predictor and response variables respectively. Features are usually multi-column time-series DataFrames where each row contains multiple data points (one per column) that should predict the same row in labels. Labels are usually a single-column time-series Series that should be predicted. Ask yourself the following questions to easily come up with a decision:

- "How can the future performance be represented, preferably as a single number? Should it be the price at the next bar, the average price change over the next week, a vector of weights for rebalancing, a boolean containing a signal, or something else?"
- "What kind of data that encompasses the past performance is likely to predict the future performance? Should it be indicators, news sentiment index, past backtesting results, or something else?"
- "Which ML model can handle such a task?" (remember that most models are limited to just a couple of specific feature and label formats!)

For the sake of an example, we'll fit a random forest classifier on all TA-Lib indicators stacked along columns to predict the binary labels generated by the label generator TRENDLB, where 1 means an uptrend and 0 means a downtrend. Sounds like fun 😌

Build a pipeline to impute and (standard-)normalize the data, [reduce the dimensionality](https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html) of the features, as well as fit one of the [linear](https://scikit-learn.org/stable/modules/linear_model.html) models to predict the average price change over the next n bars (i.e., regression task!). Based on each prediction, you can then decide whether a position is worth opening or closing out. 

# Helper functions
Create dollar bars and add them to the original df

In [3]:

def dollar_bar_func(ohlc_df, dollar_bar_size):
    """
    Aggregate time-based OHLCV rows into dollar bars.

    The dollar value of a row is ``Close * Volume``. A bar boundary is placed at every
    row where the running dollar value since the previous boundary reaches
    ``dollar_bar_size``; each bar spans the rows from one boundary (inclusive) up to the
    next boundary (exclusive). Rows after the last boundary never form a bar.

    NOTE(review): the running total starts at the second row and counts the boundary
    row's value toward the bar that ends just before it, so a bar's own traded value
    can differ from the threshold. The boundaries are kept exactly as before so that
    previously saved bars stay reproducible - confirm whether this is intended.

    Parameters:
    ohlc_df (pd.DataFrame): Must contain 'Open', 'High', 'Low', 'Close' and 'Volume'.
        'Quote volume', 'Trade count', 'Taker base volume' and 'Taker quote volume'
        are summed per bar when present. Bar timestamps come from a DatetimeIndex,
        or otherwise from an 'Open Time' column if there is one. The frame is not modified.
    dollar_bar_size (float): Dollar value that closes a bar.

    Returns:
    dollar_bars_df (pd.DataFrame): One row per completed bar with a fresh integer index.
        'Open Time' is the first row of the bar and 'Close Time' is one millisecond
        before the first row of the next bar. Empty (columns only) if no bar completes.
    """
    optional_sum_columns = ['Quote volume', 'Trade count', 'Taker base volume', 'Taker quote volume']
    sum_columns = ['Volume'] + [col for col in optional_sum_columns if col in ohlc_df.columns]

    # Work on a detached array so the caller's frame does not gain helper columns
    dollar_value = (ohlc_df['Close'] * ohlc_df['Volume']).to_numpy()

    # Generate index positions for dollar bars
    bar_indices = [0]
    cumulative_value = 0
    for i in range(1, len(dollar_value)):
        cumulative_value += dollar_value[i]
        if cumulative_value >= dollar_bar_size:
            bar_indices.append(i)
            cumulative_value = 0

    has_datetime_index = isinstance(ohlc_df.index, pd.DatetimeIndex)
    has_open_time_column = 'Open Time' in ohlc_df.columns
    one_millisecond = pd.Timedelta(milliseconds=1)

    output_columns = ['Open', 'High', 'Low', 'Close'] + sum_columns
    if has_datetime_index or has_open_time_column:
        output_columns += ['Open Time', 'Close Time']

    dollar_bars = []
    for start_idx, end_idx in zip(bar_indices[:-1], bar_indices[1:]):
        window = ohlc_df.iloc[start_idx:end_idx]

        dollar_bar = {
            'Open': window['Open'].iloc[0],
            'High': window['High'].max(),
            'Low': window['Low'].min(),
            'Close': window['Close'].iloc[-1],
        }
        for col in sum_columns:
            dollar_bar[col] = window[col].sum()

        # The bar ends right before the next bar opens. end_idx is always a valid
        # position because it was taken from bar_indices.
        if has_datetime_index:
            dollar_bar['Open Time'] = ohlc_df.index[start_idx]
            dollar_bar['Close Time'] = ohlc_df.index[end_idx] - one_millisecond
        elif has_open_time_column:
            dollar_bar['Open Time'] = ohlc_df['Open Time'].iloc[start_idx]
            dollar_bar['Close Time'] = ohlc_df['Open Time'].iloc[end_idx] - one_millisecond

        dollar_bars.append(dollar_bar)

    # A single constructor call also copes with an empty list of bars
    dollar_bars_df = pd.DataFrame(dollar_bars, columns=output_columns)

    return dollar_bars_df

# Create a simple function to simplify the number so we can use it in our column names
def simplify_number(num):
    """
    Shorten a large number to a truncated integer plus a magnitude suffix (K, M, B).
    simplify_number(1000) -> 1K

    The value is divided by 1000 until it drops below 1000 or the largest suffix is
    reached; the remainder is truncated toward zero, so 1500 -> 1K.
    """
    units = ('', 'K', 'M', 'B')
    scaled = num

    for label in units[:-1]:
        if abs(scaled) >= 1000:
            scaled /= 1000.0
        else:
            break
    else:
        # Every smaller unit was exhausted, so the value is expressed in the largest one
        label = units[-1]

    return f'{int(scaled)}{label}'

def merge_and_fill_dollar_bars(original_df, dollar_bars_df, dollar_bar_size):
    """
    Left-join dollar bars onto a time-indexed frame and forward-fill the result.

    Every dollar-bar column gets the prefix ``db_<size>_`` (e.g. ``db_90M_``), where
    ``<size>`` comes from ``simplify_number(dollar_bar_size)``. Bars are matched to
    rows of ``original_df`` by their 'Open Time'.

    Parameters:
    original_df (pd.DataFrame): Frame whose index is compared with the bars' 'Open Time'.
    dollar_bars_df (pd.DataFrame): Dollar bars with at least 'Open Time' and 'Close' columns.
    dollar_bar_size (int or float): Bar size; only used to build the column prefix.

    Returns:
    merged_df (pd.DataFrame): ``original_df`` plus the prefixed bar columns and a boolean
        ``<prefix>NewDBFlag`` column that is True on rows where a bar was matched.
        All columns except the flag are forward-filled, which includes the columns
        that came from ``original_df``.
    """
    dollar_bar_prefix = f'db_{simplify_number(dollar_bar_size)}_'
    flag_column = dollar_bar_prefix + 'NewDBFlag'

    # Prefix the bar columns and index the bars by their (datetime) open time
    dollar_bars_df_renamed = dollar_bars_df.add_prefix(dollar_bar_prefix)
    dollar_bars_df_renamed.index = pd.to_datetime(dollar_bars_df_renamed[dollar_bar_prefix + 'Open Time'])

    merged_df = original_df.merge(dollar_bars_df_renamed, how='left', left_index=True, right_index=True)

    # Must be computed before the forward fill: afterwards every row carries a bar value
    merged_df[flag_column] = merged_df[dollar_bar_prefix + 'Close'].notna()

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill')
    columns_to_ffill = [col for col in merged_df.columns if col != flag_column]
    merged_df[columns_to_ffill] = merged_df[columns_to_ffill].ffill()

    return merged_df





# Calculate Dollar Bars
Calculate dollar bars and then add technical analysis features

Uncomment this section if you want to run different size dollar bars

In [4]:
# futures_1m = vbt.BinanceData.load('/Users/ericervin/Documents/Coding/data-repository/data/BTCUSDT_1m_futures.pkl')
# futures_1m_df = futures_1m.get()

In [5]:
# dollar_bar_size = 90_000_000
# btc_dollar_bars = dollar_bar_func(futures_1m_df, dollar_bar_size=dollar_bar_size)
# btc_dollar_bars.index = pd.to_datetime(btc_dollar_bars['Open Time'])
# btc_dollar_bars.shape

In [6]:
# Convert the dataframe back into a vbt data object
# btc_90M_db_vbt = vbt.BinanceData.from_data(btc_dollar_bars)


In [7]:
# Save the dollarbars to a pickle file
# btc_90M_db_vbt.save('data/btc_90M_db_vbt.pkl')

# Load the dollar bars from pickle file
Take a small slice of the data for train/testing and leave some to be out of sample

In [8]:
# Load the 90M dollar bars that the commented-out cells above save to this pickle
btc_90M_db_vbt = vbt.BinanceData.load('data/btc_90M_db_vbt.pkl')

# `data` is the slice used for training/testing; `outofsample_data` is held back.
# NOTE(review): both slices use 2023-01-01 as a boundary and the In [12] output shows
# 2023-01-01 rows inside `data`, so the two windows may share that day - confirm.
data = btc_90M_db_vbt['2021-01-01':'2023-01-01']
outofsample_data = btc_90M_db_vbt['2023-01-01':'2023-06-03']
print(data.shape)
print(outofsample_data.shape)
# Wherever you saved the pickle file
# data_path = 'data/BTCUSDT_1m_futures2.pkl'
# min_data = vbt.BinanceData.load(data_path)
# NOTE: absolute, machine-specific path - adjust before running elsewhere
data_path = '/Users/ericervin/Documents/Coding/data-repository/data/fixed_BTCUSDT.csv' # comment this out if using the data file above
min_data = vbt.BinanceData.from_csv(data_path)

(105949,)
(19012,)


In [9]:
# Display the CSV-loaded data as a DataFrame (one OHLC row per minute in the output below)
min_data.get()

Unnamed: 0_level_0,Open,High,Low,Close
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01 00:00:00+00:00,3701.23,3703.72,3701.09,3702.46
2019-01-01 00:01:00+00:00,3702.44,3702.63,3695.66,3697.04
2019-01-01 00:02:00+00:00,3699.42,3702.04,3696.08,3698.14
2019-01-01 00:03:00+00:00,3697.49,3698.19,3695.97,3696.51
2019-01-01 00:04:00+00:00,3697.20,3697.62,3695.00,3696.32
...,...,...,...,...
2023-09-13 18:15:00+00:00,26141.87,26141.87,26137.62,26137.62
2023-09-13 18:16:00+00:00,26137.62,26144.30,26136.37,26144.30
2023-09-13 18:17:00+00:00,26144.29,26153.27,26144.29,26153.27
2023-09-13 18:18:00+00:00,26153.27,26160.00,26153.26,26156.86


In [10]:
# List all the ta features we can use
print(dir(vbt.indicators))




['ADX', 'ATR', 'BBANDS', 'IF', 'IndicatorBase', 'IndicatorFactory', 'MA', 'MACD', 'MSD', 'OBV', 'OLS', 'PATSIM', 'PIVOTINFO', 'RSI', 'SIGDET', 'STOCH', 'SUPERTREND', 'TYPE_CHECKING', 'VWAP', '__all__', '__builtins__', '__cached__', '__doc__', '__exclude_from__all__', '__file__', '__import_if_installed__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'configs', 'custom', 'enums', 'expr', 'factory', 'flex_col_param_config', 'flex_elem_param_config', 'indicator', 'nb', 'pandas_ta', 'ta', 'talib', 'talib_', 'talib_func', 'talib_plot_func', 'techcon', 'technical', 'wqa101']


In [11]:
# Print the signature and docstring of SUPERTREND.run (inputs, parameters and output names)
vbt.phelp(vbt.SUPERTREND.run)

SUPERTREND.run(
    high,
    low,
    close,
    period=Default(value=7),
    multiplier=Default(value=3),
    short_name='supertrend',
    hide_params=None,
    hide_default=True,
    **kwargs
):
    Run `SUPERTREND` indicator.
    
    * Inputs: `high`, `low`, `close`
    * Parameters: `period`, `multiplier`
    * Outputs: `trend`, `direction`, `long`, `short`
    
    Pass a list of parameter names as `hide_params` to hide their column levels, or True to hide all.
    Set `hide_default` to False to show the column levels of the parameters with a default value.
    
    Other keyword arguments are passed to `SUPERTREND.run_pipeline`.


In [12]:
# Display the dollar-bar slice as a DataFrame
data.get()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Quote volume,Trade count,Taker base volume,Taker quote volume,Open Time,Close Time
Open Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 00:00:00+00:00,28948.19,29045.93,28706.00,28786.75,3181.960,9.183460e+07,24166,1277.389,3.689901e+07,2021-01-01 00:00:00+00:00,2021-01-01 00:13:59.999000+00:00
2021-01-01 00:15:00+00:00,28786.92,28968.49,28752.30,28934.15,3338.349,9.634788e+07,28697,1808.997,5.221244e+07,2021-01-01 00:15:00+00:00,2021-01-01 00:42:59.999000+00:00
2021-01-01 00:44:00+00:00,28934.15,29122.08,28910.94,29087.19,2569.563,7.457184e+07,23009,1502.982,4.362363e+07,2021-01-01 00:44:00+00:00,2021-01-01 01:06:59.999000+00:00
2021-01-01 01:08:00+00:00,29088.42,29294.00,29087.51,29284.95,3368.618,9.838166e+07,18390,2106.629,6.153788e+07,2021-01-01 01:08:00+00:00,2021-01-01 01:11:59.999000+00:00
2021-01-01 01:13:00+00:00,29284.95,29499.00,29242.98,29446.35,3055.165,8.979413e+07,17538,1907.273,5.607186e+07,2021-01-01 01:13:00+00:00,2021-01-01 01:13:59.999000+00:00
...,...,...,...,...,...,...,...,...,...,...,...
2023-01-01 18:06:00+00:00,16583.80,16599.00,16578.80,16598.60,4952.094,8.215245e+07,27824,2450.226,4.064871e+07,2023-01-01 18:06:00+00:00,2023-01-01 19:15:59.999000+00:00
2023-01-01 19:17:00+00:00,16598.70,16615.80,16584.80,16597.60,6604.908,1.096692e+08,33917,3598.985,5.976022e+07,2023-01-01 19:17:00+00:00,2023-01-01 20:06:59.999000+00:00
2023-01-01 20:08:00+00:00,16597.60,16618.80,16584.10,16588.10,5480.869,9.099853e+07,31450,2440.154,4.051684e+07,2023-01-01 20:08:00+00:00,2023-01-01 21:09:59.999000+00:00
2023-01-01 21:11:00+00:00,16588.20,16605.40,16579.10,16594.40,5456.130,9.052970e+07,36482,2717.704,4.509331e+07,2023-01-01 21:11:00+00:00,2023-01-01 22:56:59.999000+00:00


In [49]:
supert_period = 7*150
# Run the indicator once and reuse its trend line for both crossing tests
# (it was previously computed twice with identical parameters)
supert_trend = data.run('supertrend', period=supert_period).trend
supert_crossed_below = data.close.vbt.crossed_below(supert_trend) #.vbt.signals.plot().show_svg()
supert_crossed_above = data.close.vbt.crossed_above(supert_trend) #.vbt.signals.plot().show_svg()
print("The number of times it crossed above: ", supert_crossed_above.sum())
print("The number of times it crossed below: ", supert_crossed_below.sum())

The number of times it crossed above:  1369
The number of times it crossed below:  1369


In [19]:
# Shape of the CSV-loaded data
min_data.shape

(2472140,)

In [50]:
# Shift the crossing signals with vbt's `fshift` before they are traded
# (presumably one bar forward, so orders act on the bar after the crossing - confirm)
shifted_crossed_above = supert_crossed_above.vbt.signals.fshift()
shifted_crossed_below = supert_crossed_below.vbt.signals.fshift()

In [51]:
# Backtest the built-in SuperTrend: go long on a shifted cross above and short on a
# shifted cross below; each side is closed by the opposite crossing.
# NOTE(review): tsl_th / tsl_stop presumably configure a trailing stop and its
# activation threshold - confirm against the vectorbtpro documentation.
supertrend_pf = vbt.Portfolio.from_signals(
    close           =data.close, 
    open            =data.open,
    high            =data.high,
    low             =data.low,
    entries         =shifted_crossed_above,
    exits           =shifted_crossed_below, 
    short_entries   =shifted_crossed_below,
    short_exits     =shifted_crossed_above,
    tsl_th          =0.003,
    tsl_stop        =0.0015,
    # sl_stop=0.05,
    )
print(supertrend_pf.stats())
# supertrend_pf.plot().show_svg()

Start                         2021-01-01 00:00:00+00:00
End                           2023-01-01 22:58:00+00:00
Period                                           105949
Start Value                                       100.0
Min Value                                     97.575978
Max Value                                   5883.282595
End Value                                   5880.999978
Total Return [%]                            5780.999978
Benchmark Return [%]                          -42.27761
Total Time Exposure [%]                       16.001095
Max Gross Exposure [%]                       108.448703
Max Drawdown [%]                               9.145439
Max Drawdown Duration                            4226.0
Total Orders                                       5054
Total Fees Paid                                     0.0
Total Trades                                       2733
Win Rate [%]                                  84.924991
Best Trade [%]                                 7

In [52]:
# Resample the portfolio to daily frequency before plotting it
supertrend_pf.resample('1d').plot().show()
# supertrend_pf.plot().show()

## Cross-check against a roll-your-own SuperTrend to make sure the results are correct

In [57]:
import pandas as pd
import numpy as np

def get_med_price(high, low):
    """Return the midpoint between `high` and `low`."""
    price_sum = high + low
    return price_sum / 2

def get_atr(high, low, close, period):
    """
    Return the average true range of the given price series.

    The true range of a bar is the largest of |high - low|, |high - previous close|
    and |low - previous close|. It is smoothed with an exponentially weighted mean
    using alpha = 1 / period; the first `period - 1` values are NaN.
    """
    prev_close = close.shift()
    range_candidates = [
        (high - low).abs(),
        (high - prev_close).abs(),
        (low - prev_close).abs(),
    ]
    true_range = pd.concat(range_candidates, axis=1).max(axis=1)
    smoothing = true_range.ewm(alpha=1 / period, adjust=False, min_periods=period)
    return smoothing.mean()

def get_basic_bands(med_price, atr, multiplier):
    """Return the (upper, lower) bands placed `multiplier * atr` above and below `med_price`."""
    band_offset = multiplier * atr
    return med_price + band_offset, med_price - band_offset

def get_final_bands(close, upper, lower):
    """
    Turn the basic bands into the final SuperTrend line, direction and long/short stops.

    Starting from direction 1, each bar switches to 1 when the close is above the
    previous upper band, to -1 when it is below the previous lower band, and otherwise
    keeps the previous direction. While the direction is kept, the active band is not
    allowed to move against the trend (the lower band cannot fall in an uptrend, the
    upper band cannot rise in a downtrend).

    The loop runs on private NumPy copies, so `upper` and `lower` are left untouched
    and no per-element pandas indexing is needed.

    Parameters:
    close (pd.Series): Close prices.
    upper (pd.Series): Basic upper band.
    lower (pd.Series): Basic lower band.

    Returns:
    tuple of pd.Series: (trend, dir_, long, short), all indexed like `close`. `trend`
        holds the active band, `dir_` is 1 or -1, `long` / `short` hold the band only
        while the direction is up / down and NaN otherwise. The first row of `trend`,
        `long` and `short` is NaN.
    """
    close_values = close.to_numpy(dtype=float)
    upper_values = upper.to_numpy(dtype=float, copy=True)
    lower_values = lower.to_numpy(dtype=float, copy=True)

    n_rows = close_values.shape[0]
    trend = np.full(n_rows, np.nan)
    dir_ = np.full(n_rows, 1)
    long = np.full(n_rows, np.nan)
    short = np.full(n_rows, np.nan)

    for i in range(1, n_rows):
        if close_values[i] > upper_values[i - 1]:
            dir_[i] = 1
        elif close_values[i] < lower_values[i - 1]:
            dir_[i] = -1
        else:
            dir_[i] = dir_[i - 1]
            # Ratchet the active band so it never loosens while the trend persists
            if dir_[i] > 0 and lower_values[i] < lower_values[i - 1]:
                lower_values[i] = lower_values[i - 1]
            if dir_[i] < 0 and upper_values[i] > upper_values[i - 1]:
                upper_values[i] = upper_values[i - 1]

        if dir_[i] > 0:
            trend[i] = long[i] = lower_values[i]
        else:
            trend[i] = short[i] = upper_values[i]

    index = close.index
    return (
        pd.Series(trend, index=index),
        pd.Series(dir_, index=index),
        pd.Series(long, index=index),
        pd.Series(short, index=index),
    )

def supertrend(high, low, close, period=7, multiplier=3):
    """
    Compute the SuperTrend from high/low/close series.

    Builds the basic bands around the median price at `multiplier` times the ATR over
    `period` bars and returns whatever `get_final_bands` produces for them.
    """
    basic_upper, basic_lower = get_basic_bands(
        get_med_price(high, low),
        get_atr(high, low, close, period),
        multiplier,
    )
    return get_final_bands(close, basic_upper, basic_lower)

# Run the hand-written SuperTrend on the dollar bars with the same period as the
# built-in indicator above (multiplier left at its default)
supert, superd, superl, supers = supertrend(
    data.high,
    data.low,
    data.close,
    period=supert_period,
)

In [58]:
# Crossings of the close over the hand-written SuperTrend line
supertrend_crossed_above = data.close.vbt.crossed_above(supert)
supertrend_crossed_below = data.close.vbt.crossed_below(supert)
# NOTE: these reuse (and overwrite) the `shifted_crossed_*` names of the built-in run
shifted_crossed_above = supertrend_crossed_above.vbt.signals.fshift()
shifted_crossed_below = supertrend_crossed_below.vbt.signals.fshift()


In [61]:
# Same backtest as `supertrend_pf`, but driven by the hand-written SuperTrend signals,
# so that the two stats tables can be compared
ryo_supertrend_pf = vbt.Portfolio.from_signals(
    close           =data.close, 
    open            =data.open,
    high            =data.high,
    low             =data.low,
    # Unshifted alternative, kept for reference:
    # entries         =supertrend_crossed_above,
    # exits           =supertrend_crossed_below, 
    # short_entries   =supertrend_crossed_below,
    # short_exits     =supertrend_crossed_above,
    entries         =shifted_crossed_above,
    exits           =shifted_crossed_below, 
    short_entries   =shifted_crossed_below,
    short_exits     =shifted_crossed_above,
    tsl_th          =0.003,
    tsl_stop        =0.0015,
    # sl_stop=0.05,
    )
print(ryo_supertrend_pf.stats())
# supertrend_pf.plot().show_svg()

Start                         2021-01-01 00:00:00+00:00
End                           2023-01-01 22:58:00+00:00
Period                                           105949
Start Value                                       100.0
Min Value                                     95.786596
Max Value                                   5942.906704
End Value                                   5940.600953
Total Return [%]                            5840.600953
Benchmark Return [%]                          -42.27761
Total Time Exposure [%]                       16.016196
Max Gross Exposure [%]                       108.448703
Max Drawdown [%]                               9.145439
Max Drawdown Duration                            4226.0
Total Orders                                       5050
Total Fees Paid                                     0.0
Total Trades                                       2731
Win Rate [%]                                  84.913951
Best Trade [%]                                 7

In [62]:
# Built-in SuperTrend with default parameters.
# NOTE(review): this rebinds the name `supertrend`, which an earlier cell defines as a
# function; calling supertrend(...) after this cell fails until that cell is re-run.
supertrend = data.run("supertrend").trend
supertrend

Open Time
2021-01-01 00:00:00+00:00             NaN
2021-01-01 00:15:00+00:00             NaN
2021-01-01 00:44:00+00:00             NaN
2021-01-01 01:08:00+00:00             NaN
2021-01-01 01:13:00+00:00             NaN
                                 ...     
2023-01-01 18:06:00+00:00    16621.892389
2023-01-01 19:17:00+00:00    16621.892389
2023-01-01 20:08:00+00:00    16621.892389
2023-01-01 21:11:00+00:00    16621.892389
2023-01-01 22:58:00+00:00    16621.892389
Length: 105949, dtype: float64

# Create PSAR and other functions

In [None]:

def get_psar_signal(high, low, close, af0=0.02, step=0.02, max_=0.2):
    """
    Compute Parabolic SAR reversal signals and stop levels with pandas_ta.

    Parameters:
    high (pd.Series): High prices.
    low (pd.Series): Low prices.
    close (pd.Series): Close prices.
    af0 (float): Initial acceleration factor handed to pandas_ta's psar.
    step (float): Acceleration step handed to pandas_ta's psar.
    max_ (float): Maximum acceleration handed to pandas_ta's psar.

    Returns:
    - DataFrame containing:
      * signal: buy signals (1), sell signals (-1), and no action (0).
      * close_long_price: Level at which a long position should be closed or reversed to short.
      * close_short_price: Level at which a short position should be closed or reversed to long.

    A buy is a reversal on a bar whose previous bar carried a short-side level (the
    trend flips from down to up); a sell is the mirror case. The two labels used to be
    the other way round, which contradicted the meaning of close_long_price above.
    """
    # Named `ohlc` so the notebook-level `data` object is not shadowed
    ohlc = pd.concat([high, low, close], axis=1)
    ohlc.columns = ['High', 'Low', 'Close']
    psar = ohlc.ta.psar(af0, step, max_)

    def _find_column(prefix):
        # Look the column up by prefix instead of rebuilding its suffix from the raw
        # arguments, whose formatting may differ from pandas_ta's (e.g. 1 vs 1.0)
        for name in psar.columns:
            if str(name).startswith(prefix):
                return name
        raise KeyError(f"pandas_ta psar output has no column starting with {prefix!r}")

    close_long_price = _find_column("PSARl_")   # or 'floor'
    close_short_price = _find_column("PSARs_")  # or 'ceiling'
    psar_reversal_col = _find_column("PSARr_")

    reversed_now = psar[psar_reversal_col] == 1
    was_short = psar[close_short_price].shift(1).notna()
    was_long = psar[close_long_price].shift(1).notna()

    # Calculate the signals
    signal = np.zeros(len(psar))
    signal = np.where(reversed_now & was_short, 1, signal)   # flip from short to long: buy
    signal = np.where(reversed_now & was_long, -1, signal)   # flip from long to short: sell

    result = pd.DataFrame({
        'signal': signal,
        'close_long_price': psar[close_long_price],
        'close_short_price': psar[close_short_price]
    }, index=ohlc.index)

    return result


# PSAR signals on the dollar bars with the default acceleration settings
psar_signal = get_psar_signal(data.high, data.low, data.close)

# Long-only backtest: enter where signal == 1, exit where signal == -1
psar_pf = vbt.Portfolio.from_signals(data.close, entries=psar_signal['signal'] == 1, exits=psar_signal['signal'] == -1)
print(psar_pf.stats())
psar_pf.plot().show()

In [None]:
# Plot the close price and draw every column of `psar_signal` on the same figure
fig = data.close.vbt.plot()
psar_signal.vbt.plot(fig=fig).show()

In [None]:
import pandas as pd
import numpy as np

def get_psar_signal(high, low, close, af0=0.02, step=0.02, max_=0.2, resample_period=None):
    """
    Compute PSAR signals with optional resampling.

    Parameters:
    high (pd.Series): High prices.
    low (pd.Series): Low prices.
    close (pd.Series): Close prices.
    af0 (float): Initial acceleration factor handed to pandas_ta's psar.
    step (float): Acceleration step handed to pandas_ta's psar.
    max_ (float): Maximum acceleration handed to pandas_ta's psar.
    resample_period (str, optional): If provided, the data will be resampled to this
        period before the indicator is computed. E.g. '2H' for 2 hours.

    Returns:
    - DataFrame containing:
      * signal: buy signals (1), sell signals (-1), and no action (0).
      * close_long_price: Level at which a long position should be closed or reversed to short.
      * close_short_price: Level at which a short position should be closed or reversed to long.

    A buy is a reversal on a bar whose previous bar carried a short-side level (the
    trend flips from down to up); a sell is the mirror case. The two labels used to be
    the other way round, which contradicted the meaning of close_long_price above.

    With `resample_period`, the result is mapped back onto `high.index`: every original
    row receives the values of the most recent resampled bar that had already finished,
    so a signal is repeated on all rows up to the next resampled bar. Rows before the
    first finished bar get signal 0 and NaN levels.
    """
    # Named `ohlc` so the notebook-level `data` object is not shadowed
    ohlc = pd.concat([high, low, close], axis=1)
    ohlc.columns = ['High', 'Low', 'Close']

    # Resample data if resample_period is provided
    if resample_period:
        # Label every bin with its right edge: a bin's values then become visible only
        # once the bin is over, instead of leaking into rows at the start of the bin
        ohlc = ohlc.resample(resample_period, label='right', closed='left').agg(
            {'High': 'max', 'Low': 'min', 'Close': 'last'}
        )

    psar = ohlc.ta.psar(af0, step, max_)

    def _find_column(prefix):
        # Look the column up by prefix instead of rebuilding its suffix from the raw
        # arguments, whose formatting may differ from pandas_ta's (e.g. 1 vs 1.0)
        for name in psar.columns:
            if str(name).startswith(prefix):
                return name
        raise KeyError(f"pandas_ta psar output has no column starting with {prefix!r}")

    close_long_price = _find_column("PSARl_")   # or 'floor'
    close_short_price = _find_column("PSARs_")  # or 'ceiling'
    psar_reversal_col = _find_column("PSARr_")

    reversed_now = psar[psar_reversal_col] == 1
    was_short = psar[close_short_price].shift(1).notna()
    was_long = psar[close_long_price].shift(1).notna()

    # Calculate the signals
    signal = np.zeros(len(psar))
    signal = np.where(reversed_now & was_short, 1, signal)   # flip from short to long: buy
    signal = np.where(reversed_now & was_long, -1, signal)   # flip from long to short: sell

    result = pd.DataFrame({
        'signal': signal,
        'close_long_price': psar[close_long_price],
        'close_short_price': psar[close_short_price]
    }, index=ohlc.index)

    # Map back to the original timeframe if resampling was done
    if resample_period:
        # An exact-match reindex loses every resampled row whose timestamp is absent
        # from the original index (the normal case for irregular dollar bars), so take
        # the last resampled row at or before each original timestamp instead. Unlike a
        # column-wise ffill this keeps the NaN of the inactive side.
        result = result.reindex(high.index, method='ffill')
        result['signal'] = result['signal'].fillna(0)

    return result

# PSAR signals computed on 2-hour bars and mapped back onto the dollar bars
psar_signal = get_psar_signal(data.high, data.low, data.close, resample_period='2H')

# Long-only backtest: enter where signal == 1, exit where signal == -1
psar_pf = vbt.Portfolio.from_signals(data.close, entries=psar_signal['signal'] == 1, exits=psar_signal['signal'] == -1)
print(psar_pf.stats())
psar_pf.plot().show()


In [None]:
# Plot the close price and draw every column of the resampled `psar_signal` on the same figure
fig = data.close.vbt.plot()
psar_signal.vbt.plot(fig=fig).show()

In [None]:
# Columns meant to be excluded from the feature matrix.
# NOTE(review): `prepare_data` forwards `drop_cols` to `_generate_features`, which never
# reads it - check where these columns are actually dropped.
# Wider alternative, kept for reference:
# drop_cols = ['Open Time', 'Close Time', 'Open', 'High', 'Low', 'Volume', 'Quote volume', 'Trade count',]
drop_cols = ['Open Time', 'Close Time']

def _add_pivot_trends(X, data, pivot_up_th, pivot_down_th, pivot_up_th2, pivot_down_th2, pivot_up_th3, pivot_down_th3):
    """
    Add three binary pivot-trend features ('trend', 'trend2', 'trend3') to X in place.

    For each pair of thresholds the "pivotinfo" indicator is run on `data`, and the
    feature is 1 where the close is above the indicator's `conf_value` output and 0
    otherwise. Returns the same X object.
    """
    threshold_sets = (
        ('trend', pivot_up_th, pivot_down_th),
        ('trend2', pivot_up_th2, pivot_down_th2),
        ('trend3', pivot_up_th3, pivot_down_th3),
    )
    for column, up_th, down_th in threshold_sets:
        pivots = data.run("pivotinfo", up_th=up_th, down_th=down_th)
        # Binary label: close above the pivot value -> 1, else 0
        X[column] = np.where(data.close > pivots.conf_value, 1, 0)

    return X

def _add_ta_features(X, data, lookback_window):
    """
    Add SuperTrend, VWAP, RSI, Bollinger Band and stochastic features to X in place.

    Parameters:
    X (pd.DataFrame): Feature frame indexed like `data`; new columns are added to it.
    data: vectorbtpro data object providing `run(...)` and `close`.
    lookback_window (int): Window used for SuperTrend, RSI, Bollinger Bands and the
        fast stochastic; the slow stochastic windows use twice this value.

    Returns:
    X (pd.DataFrame): The same object that was passed in.
    """
    # SUPERTREND's outputs are `trend`, `direction`, `long` and `short`; there is no
    # `supert` output, so the line is read from `trend`. The indicator is run once and
    # reused for the crossing features.
    supertrend_line = data.run("supertrend", period=lookback_window).trend
    X['supert'] = supertrend_line
    X['supert_cross_up'] = data.close.vbt.crossed_above(supertrend_line)
    X['supert_cross_down'] = data.close.vbt.crossed_below(supertrend_line)

    X['vwap'] = data.run("VWAP").vwap

    X['rsi'] = data.run("rsi", window=lookback_window).rsi
    X['rsi_overbought'] = pd.Series(np.where(X['rsi'] > 60, 1, 0), index=X.index)
    X['rsi_oversold'] = pd.Series(np.where(X['rsi'] < 40, 1, 0), index=X.index)

    bbands = data.run("bbands", window=lookback_window)
    X['bb_width'] = bbands.bandwidth
    # NOTE: despite its name this column holds %B, not a width percentage
    X['bb_width_pct'] = bbands.percent_b

    stoch = data.run(
        "stoch",
        fast_k_window=lookback_window,
        slow_k_window=lookback_window*2,
        slow_d_window=lookback_window*2,
    )
    X['fast_k'] = stoch.fast_k
    X['slow_k'] = stoch.slow_k
    X['slow_k_trending_up'] = X['slow_k'] > X['slow_k'].shift(lookback_window)
    X['slow_k_trending_down'] = X['slow_k'] < X['slow_k'].shift(lookback_window)
    X['slow_d'] = stoch.slow_d
    X['slow_k_over_slow_d'] = X['slow_k'] > X['slow_d']
    X['slow_k_under_slow_d'] = X['slow_k'] < X['slow_d']
    return X

def _add_historical_returns(X, data, lookback_window):
    # Add in historical returns
    X['pct_change_1'] = data.close.pct_change(1)
    X['pct_change_5'] = data.close.pct_change(5)
    X['pct_change_10'] = data.close.pct_change(10)
    X['pct_change_20'] = data.close.pct_change(20)
    X['pct_change_40'] = data.close.pct_change(40)
    X['pct_change_60'] = data.close.pct_change(60)
    X['pct_change_100'] = data.close.pct_change(100)
    X['pct_change_160'] = data.close.pct_change(160)
    X['pct_change_260'] = data.close.pct_change(260)
    X['pct_change_420'] = data.close.pct_change(420)
    
    # Add in the relative change as a boolean
    X['yesterday_up'] = np.where(X['pct_change_160'] > 0, 1, 0) # Using 160 bar lookback as a proxy for yesterday
    X['yesterday_down'] = np.where(X['pct_change_160'] < 0, 1, 0) # Using 160 bar lookback as a proxy for yesterday
    X['up_down_run_160'] = np.sign(data.close.diff(160)).rolling(lookback_window).sum()
    X['up_down_run_1'] = np.sign(data.close.diff(1)).rolling(lookback_window).sum()
    # Add in runs of up/down days TODO: Need to make this a function and do it properly
    # X['up_day_count'] = data.close.vbt.rolling_count(data.close > data.close.shift(160), window=lookback_window)
    
    X['mid_range_momentum']= pd.Series(np.where(X['pct_change_100'] > X['pct_change_420'], True, False), index=X.index)
    X['short_range_momentum']= pd.Series(np.where(X['pct_change_20'] > X['pct_change_40'], True, False), index=X.index)
    X['short_over_long_momentum'] = pd.Series(np.where(X['pct_change_20'] > X['pct_change_420'], True, False), index=X.index)
    X['momentum_trending'] = pd.Series(np.where(X['pct_change_20'] > X['pct_change_20'].shift(lookback_window), True, False), index=X.index)
    # Label large moves
    X['large_move_up'] = np.where(data.close > data.close.shift(lookback_window) * 1.05, 1, 0)
    X['large_move_down'] = np.where(data.close < data.close.shift(lookback_window) * 0.95, 1, 0)  
    # Drop the time columns
    return X

def _add_time_features(X):
    X['dayofmonth']  = X.index.day
    X['month']       = X.index.month
    X['year']        = X.index.year
    X['hour']        = X.index.hour
    X['minute']      = X.index.minute
    X['dayofweek']   = X.index.dayofweek   
    return X
def _handle_missing_data(df):
    df = df.replace([-np.inf, np.inf], np.nan) # replace inf with nan
    invalid_column_mask = df.isnull().all(axis=0)
    df = df.loc[:, ~invalid_column_mask] # drop invalid columns
    invalid_row_mask = df.isnull().any(axis=1) # drop rows that have nan in any column
    df = df.loc[~invalid_row_mask]
    return df

def _create_target(X, periods_future, base_predictions=None, meta=False):
    # Now we are trying to generate future price predictions so we will set the y labels to the price change n periods in the future
    y = (X.Close.shift(-periods_future) / X.Close - 1) # future price change

    if base_predictions is not None:
        # If base predictions are available, add them as a column to the data
        y = y.to_frame('future return')
        y['base prediction'] = base_predictions

    # if meta, we want to predict if the price change will be positive or negative
    if meta:
        y = (y > 0).astype(int)
    
    return y
# drop_cols = ['Open Time', 'Close Time']

def _generate_features(data, lookback_window, pivot_up_th, pivot_down_th, drop_cols):
    """
    Assemble the feature frame for `data`.

    Starts from `data.get()`, appends the 101 "wqa101_*" alpha indicators, replaces the
    NaNs present at that point with 0, and then adds pivot-trend, TA, historical-return
    and time features. The pivot features use the given thresholds and those
    thresholds scaled by 1.5 and by 2.

    NOTE(review): `drop_cols` is accepted but never used in this function - verify
    where the columns are meant to be dropped.
    """
    features = data.get()

    # All of the World Quant alphas: wqa101_1 ... wqa101_101
    alpha_names = [f"wqa101_{number}" for number in range(1, 102)]
    alphas = data.run(alpha_names, missing_index="drop")
    features = pd.concat([features, alphas], axis=1).fillna(0)

    features = _add_pivot_trends(
        features,
        data,
        pivot_up_th,
        pivot_down_th,
        pivot_up_th * 1.5,
        pivot_down_th * 1.5,
        pivot_up_th * 2,
        pivot_down_th * 2,
    )
    features = _add_ta_features(features, data, lookback_window)
    features = _add_historical_returns(features, data, lookback_window)
    return _add_time_features(features)

def prepare_data(data, base_predictions=None, meta=False, pivot_up_th=0.10, pivot_down_th=0.10, periods_future=150, drop_cols=None):
    """
    Build aligned features X and target y for modeling.

    Parameters:
    data: vectorbtpro data object the features are generated from.
    base_predictions (optional): Predictions of a base model, forwarded to `_create_target`.
    meta (bool): Forwarded to `_create_target`; if True the target is a 0/1 flag.
    pivot_up_th (float): Base upward threshold for the pivot-trend features.
    pivot_down_th (float): Base downward threshold for the pivot-trend features.
    periods_future (int): Number of bars ahead the target looks. The feature lookback
        window is 14 times this value.
    drop_cols (list, optional): Forwarded to `_generate_features`. Defaults to an empty list.

    Returns:
    X (pd.DataFrame): Features with string column names, without the last
        `periods_future` rows and without rows containing missing values.
    y (pd.Series or pd.DataFrame): Target reindexed to X's index.
    """
    # Avoid a shared mutable default argument
    drop_cols = [] if drop_cols is None else drop_cols

    lookback_window = 14*periods_future  # Number of dollar bars we are predicting into the future times the typical RSI lookback window of 14
    X = _generate_features(data, lookback_window, pivot_up_th, pivot_down_th, drop_cols)

    # Create y from the full feature frame, before any rows are removed
    y = _create_target(X, periods_future, base_predictions, meta)

    # Remove the trailing rows whose future price is unknown. An explicit stop position
    # keeps every row when periods_future is 0 (X.iloc[:-0] would return no rows at all).
    X = X.iloc[:len(X) - periods_future]

    # Handle missing data in X
    X = _handle_missing_data(X)

    # Convert column names to string
    X.columns = X.columns.astype(str)

    # Reindex y based on X's index; this guarantees that X and y have the same length
    y = y.reindex(X.index)

    return X, y


def create_pipeline(X, model='xgb', task='regression'):
    """
    Create a scikit-learn pipeline: mean imputation, standard scaling, then an estimator.

    Parameters:
    X (DataFrame): The feature matrix. Not needed to build the pipeline; kept so existing callers keep working.
    model (str): The estimator to use. Default is 'xgb' (XGBoost).
        Classification: 'xgb', 'logistic', 'svc'.
        Regression: 'xgb', 'ridge', 'linear', 'logistic', 'lasso', 'elasticnet', 'svr'.
    task (str): Either 'classification' or 'regression'. Default is 'regression'.

    Returns:
    pipeline (Pipeline): The unfitted scikit-learn pipeline.

    Raises:
    ValueError: If `task` or `model` is not one of the supported names.
    """
    # Factories are lazy so that no estimator is built before the request is validated
    classifier_factories = {
        'xgb': lambda: XGBClassifier(),  # XGBoost classification
        'logistic': lambda: LogisticRegression(),  # Logistic regression
        'svc': lambda: SVC(),  # Support Vector Classification
    }
    regressor_factories = {
        'xgb': lambda: XGBRegressor(objective='reg:squarederror'),  # XGBoost regression
        'ridge': lambda: Ridge(),  # Ridge regression
        'linear': lambda: LinearRegression(),  # Linear regression
        # NOTE(review): LogisticRegression is a classifier; kept under 'regression'
        # only because existing callers may rely on it.
        'logistic': lambda: LogisticRegression(),
        'lasso': lambda: Lasso(),  # Lasso regression
        'elasticnet': lambda: ElasticNet(),  # ElasticNet regression
        'svr': lambda: SVR(),  # Support Vector Regression
    }

    if task == 'classification':
        factories = classifier_factories
        unknown_model_message = "Invalid model name for classification. Choose from 'xgb', 'logistic', 'svc'."
    elif task == 'regression':
        factories = regressor_factories
        unknown_model_message = "Invalid model name. Choose from 'xgb', 'ridge', 'linear', 'logistic', 'lasso', 'elasticnet', 'svr'."
    else:
        raise ValueError("Invalid task. Choose from 'classification', 'regression'.")

    if model not in factories:
        raise ValueError(unknown_model_message)

    steps = [
        ('imputation', SimpleImputer(strategy='mean')),  # Imputation replaces missing values
        ('scaler', StandardScaler()),  # StandardScaler normalizes the data
        # ('pca', PCA(n_components=15))  # PCA reduces dimensionality
        ('model', factories[model]()),
    ]
    return Pipeline(steps)

def create_cv(X, min_length=600, offset=200, split=-200, set_labels=["train", "test"]):
    """
    Build an expanding-window cross-validator and the splitter it yields for X.

    Parameters:
    X (DataFrame): The feature matrix the splitter is generated for.
    min_length (int): Forwarded to vbt.SKLSplitter as `min_length`.
    offset (int): Forwarded to vbt.SKLSplitter as `offset`.
    split (int): Forwarded to vbt.SKLSplitter as `split`.
    set_labels (list): Names of the sets within each split.

    Returns:
    cv_splitter: The object returned by cv.get_splitter(X).
    cv (SKLSplitter): The cross-validation object.
    """
    expanding_cv = vbt.SKLSplitter(
        "from_expanding",
        min_length=min_length,
        offset=offset,
        split=split,
        set_labels=set_labels,
    )
    return expanding_cv.get_splitter(X), expanding_cv

def create_cv_with_gap(X, min_length=600, test_amount=200, gap = 150, set_labels=["train", "test"]):
    """
    Build an expanding-window cross-validator with a gap between the train and test sets.

    Parameters:
    X (DataFrame): The feature matrix the splitter is generated for.
    min_length (int): Forwarded to vbt.SKLSplitter as `min_length`.
    test_amount (int): Used both as the splitter `offset` and as the last element of the `split` tuple.
    gap (int): Length of the vbt.RelRange gap placed between the two sets.
    set_labels (list): Names of the sets within each split.

    Returns:
    cv_splitter: The object returned by cv.get_splitter(X).
    cv (SKLSplitter): The cross-validation object.
    """
    # Gap range sitting between the first and last elements of the split tuple.
    # Presumably, with backwards=True, the test rows are laid out from the end
    # of each window - TODO confirm against the vectorbtpro docs.
    gap_range = vbt.RelRange(length=gap, is_gap=True)
    splitter_options = dict(
        min_length=min_length,
        offset=test_amount,
        split=(1.0, gap_range, test_amount),
        set_labels=set_labels,
        split_range_kwargs=dict(backwards=True),
    )
    gapped_cv = vbt.SKLSplitter("from_expanding", **splitter_options)
    return gapped_cv.get_splitter(X), gapped_cv

def create_rolling_cv(X, length=2000, split=0.90, offset=True, offsetlen=0, set_labels=["train", "test"]):
    """
    Build a rolling-window cross-validator and the splitter it yields for X.

    Parameters:
    X (DataFrame): The feature matrix the splitter is generated for.
    length (int): Length of each rolling window.
    split (float): Fraction of the window used for training.
    offset (bool): When True, a negative offset anchored at the previous window's end is applied;
        when False, the splitter's default stepping is used.
    offsetlen (int): Has no effect: it is recomputed when `offset` is True and unused otherwise.
    set_labels (list): Names of the sets within each split.

    Returns:
    cv_splitter: The object returned by cv.get_splitter(X).
    cv (SKLSplitter): The cross-validation object.
    """
    if not offset:
        rolling_cv = vbt.SKLSplitter("from_rolling", length=length, split=split, set_labels=set_labels)
    else:
        # Train length minus test length: 2 * (length * split) - length
        step_back = 2*(length * split) - length
        rolling_cv = vbt.SKLSplitter(
            "from_rolling",
            length=length,
            split=split,
            offset=-step_back,
            offset_anchor="prev_end",
            set_labels=set_labels,
        )
    return rolling_cv.get_splitter(X), rolling_cv
    
def create_rolling_cv_with_gap(X, length=500, split=0.70, gap=150, set_labels=["train", "test"]):
    """
    Build a rolling-window cross-validator with a gap between the train and test sets.

    Parameters:
    X (DataFrame): The feature matrix the splitter is generated for.
    length (int): Total length of each rolling window (train + gap + test).
    split (float): Fraction of the non-gap part of the window used for training.
    gap (int): Number of rows left out between the train and test sets.
    set_labels (list): Names of the sets within each split.

    Returns:
    cv_splitter: The object returned by cv.get_splitter(X).
    cv (SKLSplitter): The cross-validation object.

    Raises:
    AssertionError: If `length` is not greater than `gap`.
    """
    assert length > gap, "Length must be greater than gap"

    # Train rows come out of what is left once the gap is removed; the test set takes the remainder
    train_len = int((length - gap) * split)
    test_len = length - train_len - gap
    # Negative step equal to the train/test size difference
    window_offset = test_len - train_len

    gap_range = vbt.RelRange(length=gap, is_gap=True)
    rolling_cv = vbt.SKLSplitter(
        "from_rolling",
        length=length,
        split=(train_len, gap_range, 1.0),
        offset=window_offset,
        set_labels=set_labels,
    )
    return rolling_cv.get_splitter(X), rolling_cv

def cross_validate_and_train(pipeline, X, y, cv_splitter, model_name="", verbose_interval=10, n_clusters=6, clustering=False):
    """
    Fit the pipeline on every train slice and collect its predictions on the matching test slice.

    Parameters:
    pipeline: Estimator with fit/predict; it is refitted on each split, so it is returned as fitted on the last one.
    X (DataFrame): The feature matrix.
    y (Series): The target.
    cv_splitter: Splitter whose take() returns slices keyed by (split, "train") / (split, "test").
    model_name (str): Prefix for the progress messages.
    verbose_interval (int): The test MSE is printed for splits whose label is a multiple of this value.
    n_clusters (int): Number of KMeans clusters when `clustering` is enabled.
    clustering (bool): When True, a KMeans "cluster" column fitted on the train slice is added to both slices.

    Returns:
    pipeline: The pipeline as fitted on the last split.
    test_labels (Series): Concatenated test targets, named "labels", first occurrence kept per index.
    test_preds (Series): Concatenated test predictions, named "preds", first occurrence kept per index.
    """
    feature_slices = cv_splitter.take(X)
    target_slices = cv_splitter.take(y)

    split_ids = feature_slices.index.unique(level="split")
    print(f"Total number of cross-validation splits: {len(split_ids)}")

    fold_labels = []
    fold_preds = []
    for split in split_ids:
        train_X, train_y = feature_slices[(split, "train")], target_slices[(split, "train")]
        test_X, test_y = feature_slices[(split, "test")], target_slices[(split, "test")]

        if clustering:
            # Clusters are learned from the train slice only, then applied to both slices
            km = KMeans(n_clusters=n_clusters, random_state=0)
            km.fit(train_X)
            train_X["cluster"] = km.predict(train_X)

            test_clusters = km.predict(test_X)
            cluster_sizes = Counter(test_clusters)
            test_X["cluster"] = test_clusters

        pipeline.fit(train_X, train_y)

        fold_pred = pd.Series(pipeline.predict(test_X), index=test_y.index)
        fold_labels.append(test_y)
        fold_preds.append(fold_pred)

        # Progress output only on every 'verbose_interval'-th split label
        if split % verbose_interval != 0:
            continue
        print(f"{model_name} Split {split} Mean Squared Error: {mean_squared_error(test_y, fold_pred)}")
        if clustering:
            print("Cluster Sizes:")
            for cluster_id, size in cluster_sizes.items():
                print(f"Cluster {cluster_id}: {size}")

    all_labels = pd.concat(fold_labels).rename("labels")
    all_preds = pd.concat(fold_preds).rename("preds")

    # Overlapping test windows repeat index entries; keep the earliest one
    all_labels = all_labels[~all_labels.index.duplicated(keep='first')]
    all_preds = all_preds[~all_preds.index.duplicated(keep='first')]

    return pipeline, all_labels, all_preds

def evaluate_predictions(test_labels, test_preds, model_name="", meta=False):
    """
    Compute, print and return evaluation metrics for a set of predictions.

    Parameters:
    test_labels (Series): True values.
    test_preds (Series): Predicted values.
    model_name (str): Prefix for every printed line.
    meta (bool): When True, classification metrics are used; otherwise regression metrics.

    Returns:
    (acc, prec, recall, f1, auc_roc) when `meta` is True, otherwise (mse, rmse, mae, r2).
    """
    if not meta:
        # Regression metrics for the original model
        mse = mean_squared_error(test_labels, test_preds)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(test_labels, test_preds)
        r2 = r2_score(test_labels, test_preds)

        regression_report = (
            ("Mean Squared Error (MSE)", mse),
            ("Root Mean Squared Error (RMSE)", rmse),
            ("Mean Absolute Error (MAE)", mae),
            ("R-squared", r2),
        )
        for title, value in regression_report:
            print(f"{model_name} {title}: {value}")
        return mse, rmse, mae, r2

    # Classification metrics for the metamodel
    acc = accuracy_score(test_labels, test_preds)
    prec = precision_score(test_labels, test_preds)
    recall = recall_score(test_labels, test_preds)
    f1 = f1_score(test_labels, test_preds)
    auc_roc = roc_auc_score(test_labels, test_preds)

    classification_report_rows = (
        ("Accuracy", acc),
        ("Precision", prec),
        ("Recall", recall),
        ("F1 Score", f1),
        ("AUC-ROC", auc_roc),
    )
    for title, value in classification_report_rows:
        print(f"{model_name} {title}: {value}")
    return acc, prec, recall, f1, auc_roc

import numpy as np

def extract_feature_importance(pipeline, X, clustering=False, top_n=None):
    """
    Plot and print the feature importances of the pipeline's 'model' step.

    Parameters:
    pipeline (Pipeline): Fitted pipeline whose 'model' step exposes feature_importances_.
    X (DataFrame): Feature matrix; its column names label the importances.
    clustering (bool): When True, a trailing 'cluster' feature name is added.
    top_n (int or None): Keep only the `top_n` most important features; None keeps all.

    Returns:
    None
    """
    estimator = pipeline.named_steps['model']
    names = X.columns.tolist()
    if clustering:
        names.append('cluster')

    # Names and importances are paired by position; a length mismatch surfaces as NaN rows
    fi_df = pd.concat(
        [
            pd.Series(names, name='feature_names'),
            pd.Series(estimator.feature_importances_, name='feature_importance'),
        ],
        axis=1,
    )

    incomplete_rows = fi_df.isnull().any(axis=1)
    if incomplete_rows.any():
        print("Found missing feature importance. Features are:")
        print(fi_df[incomplete_rows])
    else:
        print("No missing feature importance found.")

    # Most important features first, optionally truncated
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)
    if top_n is not None:
        fi_df = fi_df.head(top_n)

    # Horizontal bar chart of the importances
    plt.figure(figsize=(12,8))
    plt.barh(fi_df['feature_names'], fi_df['feature_importance'], align='center')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    plt.show()

    # Text listing of the same values
    for _, entry in fi_df.iterrows():
        print(f"{entry['feature_names']}: {entry['feature_importance']}")
   
def plot_prediction_vs_actual(x, y, pos_threshold=0.09, neg_threshold=-0.09, title='Test Predictions vs Actual Results'):
    """
    Scatter predictions against actual results, coloured by sign agreement, with a fitted line.

    Prints the fitted line's equation and the share of points in each sign quadrant,
    and draws vertical lines at the given thresholds.

    Parameters:
    x (Series): Predictions
    y (Series): Actual results
    pos_threshold (float): Positive threshold to mark on the x axis; None skips it
    neg_threshold (float): Negative threshold to mark on the x axis; None skips it
    title (str): Title of the plot

    Returns:
    None
    """
    # (legend label, mask, colour, alpha) for each sign quadrant
    quadrants = [
        ('True Positives', (x > 0) & (y > 0), 'green', 0.5),
        ('True Negatives', (x < 0) & (y < 0), 'pink', 0.5),
        ('False Positives', (x > 0) & (y < 0), 'grey', 0.1),
        ('False Negatives', (x < 0) & (y > 0), 'grey', 0.1),
    ]

    # Share of all points falling into each quadrant, in percent
    shares = {label: (mask.sum() / len(x)) * 100 for label, mask, _, _ in quadrants}

    for label, mask, colour, opacity in quadrants:
        plt.scatter(x[mask], y[mask], color=colour, alpha=opacity, label=label, s=10)

    # Degree-1 least-squares fit of actual results on predictions
    coefficients = np.polyfit(x, y, 1)
    fitted_line = np.poly1d(coefficients)
    plt.plot(x, fitted_line(x), color='black', label='Line of Best Fit')

    # Axes through the origin
    plt.axhline(0, color='black', linestyle='dotted')
    plt.axvline(0, color='black', linestyle='dotted')

    # Threshold markers, skipped when the threshold is None
    markers = (
        (pos_threshold, 'green', 'Positive Threshold'),
        (neg_threshold, 'red', 'Negative Threshold'),
    )
    for level, colour, label in markers:
        if level is not None:
            plt.axvline(x=level, color=colour, linestyle='--', label=label)

    plt.title(title)
    plt.xlabel('Predictions')
    plt.ylabel('Actual Results')
    plt.legend()

    slope, intercept = coefficients
    print(f"The equation of the regression line is: y = {slope:.3f}x + {intercept:.3f}")

    # The first quadrant line is preceded by a blank line
    for position, (label, _, _, _) in enumerate(quadrants):
        lead = "\n" if position == 0 else ""
        print(f"{lead}Percentage of {label}: {shares[label]:.2f}%")

    plt.show()
   
def simulate_pf(data, test_preds, open_long_th=0.01, close_long_th=0.0, close_short_th=0.0, open_short_th=-0.01, plot=True):
    """
    Simulate a long/short portfolio whose signals come from thresholding the predictions.

    Parameters:
    data: Data object exposing `close`.
    test_preds (Series): Predictions; only their index range of `data.close` is simulated.
    open_long_th (float): Enter long when the prediction is above this value.
    close_long_th (float): Exit long when the prediction is below this value.
    close_short_th (float): Exit short when the prediction is above this value.
    open_short_th (float): Enter short when the prediction is below this value.
    plot (bool): When equal to True, the portfolio plot is shown.

    Returns:
    insample_pf (Portfolio): The simulated portfolio.
    """
    # Restrict prices to the rows that have a prediction
    test_close = data.close[test_preds.index]
    signals = dict(
        entries=test_preds > open_long_th,
        exits=test_preds < close_long_th,
        short_entries=test_preds < open_short_th,
        short_exits=test_preds > close_short_th,
    )
    insample_pf = vbt.Portfolio.from_signals(test_close, **signals)

    print(insample_pf.stats())
    # Equality (not truthiness) is kept on purpose so behaviour matches existing callers
    if plot == True:
        insample_pf.plot().show()
    return insample_pf

def find_optimal_thresholds(y_true, y_pred, target_percent_pos=70, target_percent_neg=70):
    """
    Find the prediction cut-offs at which a target share of outcomes has the expected sign.

    The positive threshold is the smallest prediction p such that, among all rows with
    y_pred >= p, at least `target_percent_pos` percent have y_true > 0. The negative
    threshold is the largest prediction p such that, among all rows with y_pred <= p,
    at least `target_percent_neg` percent have y_true < 0.

    Computed with one sort and cumulative sums instead of rescanning the whole
    frame for every row.

    Parameters:
    y_true (Series): Actual results.
    y_pred (Series): Predictions.
    target_percent_pos (float): Required share of positive outcomes, in percent.
    target_percent_neg (float): Required share of negative outcomes, in percent.

    Returns:
    pos_threshold, neg_threshold: The two cut-offs; each is None when no prediction reaches its target.
    """
    # Combine y_true and y_pred into a DataFrame (aligns them on their index)
    data = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})

    # A missing prediction never satisfies >= or <=, so such rows belong to no slice
    data = data[data['y_pred'].notna()]
    if data.empty:
        return None, None

    data = data.sort_values(by='y_pred', ascending=True, kind='stable')
    preds = data['y_pred'].to_numpy()
    is_positive = (data['y_true'] > 0).to_numpy()
    is_negative = (data['y_true'] < 0).to_numpy()
    n_rows = len(preds)

    # positives_before[i] / negatives_before[i]: matching labels among the first i sorted rows
    positives_before = np.concatenate(([0], np.cumsum(is_positive)))
    negatives_before = np.concatenate(([0], np.cumsum(is_negative)))

    # Positive side: the slice "y_pred >= p" starts at the first row tied with p
    tail_start = np.searchsorted(preds, preds, side='left')
    tail_size = n_rows - tail_start
    tail_positives = positives_before[n_rows] - positives_before[tail_start]
    pos_hits = np.flatnonzero(tail_positives / tail_size >= target_percent_pos / 100)
    pos_threshold = preds[pos_hits[0]] if pos_hits.size else None

    # Negative side: the slice "y_pred <= p" ends just after the last row tied with p
    head_size = np.searchsorted(preds, preds, side='right')
    head_negatives = negatives_before[head_size]
    neg_hits = np.flatnonzero(head_negatives / head_size >= target_percent_neg / 100)
    neg_threshold = preds[neg_hits[-1]] if neg_hits.size else None

    # Return the positive and negative thresholds
    return pos_threshold, neg_threshold



# Cross Validate with an expanding window

In [None]:
# Resample the minute-level data to daily and hourly bars, then pick the
# timeframe used for modeling. `min_data` is defined in an earlier cell.
# NOTE(review): despite the `_df` suffix these are whatever `min_data.resample`
# returns - presumably vbt Data objects, since `data.get()`/`data.close` are used later; confirm.
daily_df = min_data.resample('1d')  #.agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum', 'Quote volume': 'sum', 'Trade count': 'sum', 'Taker base volume': 'sum', 'Taker quote volume': 'sum'})
hourly_df = min_data.resample('1h') #.agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum', 'Quote volume': 'sum', 'Trade count': 'sum', 'Taker base volume': 'sum', 'Taker quote volume': 'sum'})


# daily_df = vbt.BinanceData.from_data(daily_df)
# hourly_df = vbt.BinanceData.from_data(hourly_df)


# Timeframe selection: hourly bars are used below
# data = daily_df
data = hourly_df

In [None]:
# Expanding-window experiment: prepare data, cross-validate an XGBoost
# regressor, derive entry/exit thresholds from the cross-validated
# predictions and simulate portfolios on them.

# Future Prediction Length (rows ahead); also used as the time-delta stop below
periods_future = prediction_window = 1
pivot_up_th = 0.1
pivot_down_th = 0.05
clustering = False
# Establish the training and testing sets for cross validation
windowsize = 700
split = 0.70
test_amount = int(split * windowsize)  # passed to create_cv_with_gap as test_amount
gap = periods_future  # not used below; the call passes periods_future directly
# These are passed to find_optimal_thresholds as target_percent_pos/neg,
# i.e. the required share (in percent) of outcomes with the predicted sign.
strong_percentile = 60 # Target win rate (%) for the entry thresholds
weak_percentile = 50 # Target win rate (%) for the exit thresholds

# Prep Data
X, y = prepare_data(data, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future, drop_cols = []) # in-sample
Xoos, yoos = prepare_data(outofsample_data, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future, drop_cols=[]) # out-of-sample

# Set up the pipeline and create the cross validation splits
pipeline = create_pipeline(X, model='xgb')
cv_splitter, cv = create_cv_with_gap(X, min_length=windowsize, test_amount=test_amount, gap=periods_future, set_labels=["train", "test"])

# Train and cross-validate
final_pipeline, test_labels, test_preds = cross_validate_and_train(
    pipeline, X, y, cv_splitter, model_name="In-Sample", verbose_interval=5, clustering=clustering)

# Evaluate
mse, rmse, mae, r2 = evaluate_predictions(test_labels, test_preds, model_name="In-Sample")

# Thresholds at which the target win rates above are reached


strong_pos_threshold, strong_neg_threshold = find_optimal_thresholds(y_true=test_labels, y_pred=test_preds, target_percent_pos=strong_percentile, target_percent_neg=strong_percentile)
weak_pos_threshold, weak_neg_threshold = find_optimal_thresholds(y_true=test_labels, y_pred=test_preds, target_percent_pos=weak_percentile, target_percent_neg=weak_percentile)

print(f"Positive threshold with a {strong_percentile}th percent win rate : {strong_pos_threshold}")
# NOTE: the count below is the number of positive labels in the test set,
# regardless of what the model predicted for them.
print(f"Number of observations where that were true positives: {len(test_labels[test_labels > 0])}")
print(f"Negative threshold with a {strong_percentile}th percent win rate: {strong_neg_threshold}")
print(f"Weaker Positive threshold with a {weak_percentile}th percent win rate : {weak_pos_threshold}")
print(f"Weaker Negative threshold with a {weak_percentile}th percent win rate: {weak_neg_threshold}")

plot_prediction_vs_actual(test_preds, test_labels, pos_threshold=strong_pos_threshold, neg_threshold=strong_neg_threshold, title='In-Sample Predictions vs Actual Results')

# Check the primary features that impacted the model
extract_feature_importance(final_pipeline, X, clustering=clustering, top_n=20)

# Show heatmap of the predictions ontop of a price plot
# data.close.vbt.overlay_with_heatmap(test_preds).show_svg()

# Simulate a portfolio: enter on the strong thresholds, exit on the weak ones
insample_pf = simulate_pf(
    data=data, test_preds=test_preds, open_long_th=strong_pos_threshold, close_long_th=weak_pos_threshold, close_short_th=weak_neg_threshold, open_short_th=strong_neg_threshold, plot=False)

# Simulate the portfolio with a time delta stop
print('\n=========================Time Delta Portfolio=========================\n')

# Long-only: enter above the strong positive threshold, leave via the
# time-delta stop of `prediction_window` rows.
timedelta_pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index],
    entries             =test_preds > strong_pos_threshold,
    # exits               =test_preds < 0,
    # short_entries       =test_preds < strong_neg_threshold,
    # short_exits         =test_preds > 0,
    td_stop             = prediction_window,
    time_delta_format   ='Rows',
)

print(timedelta_pf.stats())

In [None]:
# Baseline variant of the time-delta portfolio: go long on any positive
# prediction and leave via the time-delta stop of `prediction_window` rows.
# Overwrites the `timedelta_pf` built in the previous cell.
timedelta_pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index],
    entries             = test_preds > .0,
    # exits               =test_preds < 0,
    # short_entries       = test_preds < -0.01,
    # short_exits         =test_preds > 0,
    td_stop             = prediction_window,
    time_delta_format   ='Rows',
)

print(timedelta_pf.stats())

In [None]:
# Plot the most recently built time-delta portfolio
timedelta_pf.plot().show()

In [None]:
# Recompute the thresholds with find_optimal_thresholds' default targets
# (70% on both sides) and redraw the scatter plot with them.
strong_pos_threshold, strong_neg_threshold= find_optimal_thresholds(y_true=test_labels, y_pred=test_preds)

plot_prediction_vs_actual(test_preds, test_labels, pos_threshold=strong_pos_threshold, neg_threshold=strong_neg_threshold, title='In-Sample Predictions vs Actual Results')


In [None]:
# Scratch attempt at locating thresholds from running totals instead of
# find_optimal_thresholds.
percentile = 70
df = pd.DataFrame({
    'y_true': test_labels,
    'y_pred': test_preds
})
# NOTE(review): rows are ordered by y_true although the thresholds are read
# from y_pred - confirm whether sorting by 'y_pred' was intended.
df.sort_values(by='y_true', inplace=True)

# Calculate the cumulative sum of positive true values and the total count
df['cumulative_positives'] = (df['y_true'] > 0).cumsum()
df['cumulative_count'] = np.arange(1, len(df) + 1)  # 1-based running row count
positive_share = df['cumulative_positives'] / df['cumulative_count']

# Find the first point where the ratio of cumulative positives to count is >= percentile
pos_index = df[positive_share >= percentile / 100.0].first_valid_index()
neg_index = df[positive_share <= (100 - percentile) / 100.0].last_valid_index()

# Compare against None explicitly: a valid index label (e.g. 0) can be falsy
pos_threshold = df.loc[pos_index, 'y_pred'] if pos_index is not None else None
neg_threshold = df.loc[neg_index, 'y_pred'] if neg_index is not None else None

print(f"Positive threshold with a {percentile}th percent win rate : {pos_threshold} ")


In [None]:
import pandas as pd
# Pair the cross-validated labels and predictions in one frame
df = pd.DataFrame({
    'y_true': test_labels,
    'y_pred': test_preds
})

# Keep only the rows with a negative prediction
filtered_df = df[df['y_pred'] < 0]
# Sorted distinct prediction values among those rows (the search space below)
unique_x_values = sorted(filtered_df['y_pred'].unique())
# Binary search function
def binary_search_for_ratio(unique_x, df):
    """
    Bisect `unique_x` for a prediction value whose rows have as many wins as losses.

    At each probe, only the rows of `df` whose 'y_pred' equals the probed value
    are considered. The search moves left when positives outnumber negatives
    and right otherwise.

    Parameters:
    unique_x (list): Sorted candidate prediction values.
    df (DataFrame): Frame with 'y_pred' and 'y_true' columns.

    Returns:
    The probed value whose positive/negative ratio is exactly 1.0, or None when
    a probed value has no negative rows or the search is exhausted.
    """
    lo, hi = 0, len(unique_x) - 1

    while lo <= hi:
        mid = lo + (hi - lo) // 2
        candidate = unique_x[mid]

        outcomes = df.loc[df['y_pred'] == candidate, 'y_true']
        wins = (outcomes > 0).sum()
        losses = (outcomes < 0).sum()
        # The ratio is undefined without any losing row; the search stops here
        if losses == 0:
            return None

        ratio = wins / losses
        if ratio > 1.0:
            hi = mid - 1
        elif ratio == 1.0:
            return candidate
        else:
            lo = mid + 1

    return None
# Perform binary search over the negative prediction values and report the
# value (or None) returned by binary_search_for_ratio
x_coordinate_ratio_1 = binary_search_for_ratio(unique_x_values, filtered_df)
print("x-coordinate where the ratio is 1.0:", x_coordinate_ratio_1)

In [None]:
# Split the positive predictions by whether the outcome was positive too.
# .copy() makes these independent frames, so adding columns below does not
# act on a slice of df.
true_positives = df[(df['y_pred']>0) & (df['y_true']>0)].copy()
false_positives = df[(df['y_pred']>0) & (df['y_true']<0)].copy()

# Show the cutoff where the model is 70% accurate
true_positives = true_positives.sort_values(by='y_pred', ascending=False)
true_positives['total_true_positives'] = len(true_positives)
# NOTE(review): the original cell read this column without ever creating it.
# A 1-based rank by descending prediction is assumed - confirm the intent.
true_positives['true_pos_rank'] = np.arange(1, len(true_positives) + 1)
true_positives['true_pos_rank']


In [None]:
# Plot the threshold-based portfolio and the time-delta-stop portfolio
insample_pf.plot().show()
timedelta_pf.plot().show()

# Rolling CV
This takes quite a bit of time, and I'm not sure the juice is worth the squeeze.

In [None]:
# Rolling-window experiment: same workflow as the expanding-window cell, but
# with a fixed-length rolling window and a gap between train and test.

# Future Prediction Length
periods_future = 150
pivot_up_th = 0.01
pivot_down_th = 0.01
clustering = False

# Establish the training and testing sets for cross validation
windowsize = 10000
split = 0.90
gap = periods_future  # not used below; the call passes periods_future directly

# Prep Data
X, y = prepare_data(data, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future) # in-sample
# print(X)
Xoos, yoos = prepare_data(outofsample_data, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future) # out-of-sample
# print(X.columns)
# Set up the pipeline and create the cross validation splits
pipeline = create_pipeline(X, model='xgb')
cv_splitter, cv = create_rolling_cv_with_gap(X, length=windowsize, split=split, gap=periods_future, set_labels=["train", "test"])

# Train and cross-validate
final_pipeline, test_labels, test_preds = cross_validate_and_train(
    pipeline, X, y, cv_splitter, model_name="In-Sample", verbose_interval=50, clustering=clustering)

# Evaluate
mse, rmse, mae, r2 = evaluate_predictions(test_labels, test_preds, model_name="In-Sample")

# Check the scatter plot of predictions vs actual results (default +/-0.09 threshold markers)
plot_prediction_vs_actual(test_preds, test_labels, title='In-Sample Predictions vs Actual Results')
# Check the primary features that impacted the model
extract_feature_importance(final_pipeline, X, clustering=clustering, top_n=20)

# Show heatmap of the predictions ontop of a price plot
# data.close.vbt.overlay_with_heatmap(test_preds).show_svg()

# Simulate a portfolio
# NOTE(review): shorts open below -0.01 but close above -0.03, so predictions
# between the two trigger both signals - the two short thresholds look swapped; confirm.
insample_pf = simulate_pf(data=data, test_preds=test_preds, open_long_th=0.03, close_long_th=0.0, close_short_th=-0.03, open_short_th=-0.01, plot=False)


In [None]:
# Check the primary features that impacted the model (ten most important only)
extract_feature_importance(final_pipeline, X, clustering=clustering, top_n=10)

In [None]:
# Plot the current threshold-based portfolio.
# NOTE(review): `timedelta_pf` is not rebuilt in the rolling-CV cell above, so
# this presumably still shows the portfolio from the expanding-window section; confirm.
insample_pf.plot().show()
timedelta_pf.plot().show()

In [None]:
# The first expression's result is discarded; only the last expression of a
# notebook cell is displayed.
data.close[test_preds.index]
# Summary statistics of the cross-validated predictions
test_preds.describe()

In [None]:
# Summary statistics of the test labels, for comparison with the predictions
test_labels.describe()

In [None]:
# Long/short portfolio entered on the strong thresholds and closed by a
# time-delta stop instead of exit signals.
# NOTE(review): `strong_pos_threshold`, `strong_neg_threshold` and
# `prediction_window` come from earlier cells, while `test_preds` comes from
# the most recent CV run - confirm they belong to the same experiment.
insample_pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index],  # use only the test set
    entries             =test_preds > strong_pos_threshold,
    # exits               =test_preds < 0,
    short_entries       =test_preds < strong_neg_threshold,
    # short_exits         =test_preds > 0,
    td_stop             =prediction_window,
    time_delta_format   ='Rows',
)
print(insample_pf.stats())
    

# Rolling CV
This takes quite a bit of time, and I'm not sure the juice is worth the squeeze.

In [None]:
# Repeat of the rolling-window experiment above; the only difference is that
# all feature importances are shown (no top_n limit).

# Future Prediction Length
periods_future = 150
pivot_up_th = 0.01
pivot_down_th = 0.01
clustering = False

# Establish the training and testing sets for cross validation
windowsize = 10000
split = 0.90
gap = periods_future  # not used below; the call passes periods_future directly

# Prep Data
X, y = prepare_data(data, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future) # in-sample
# print(X)
Xoos, yoos = prepare_data(outofsample_data, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future) # out-of-sample
# print(X.columns)
# Set up the pipeline and create the cross validation splits
pipeline = create_pipeline(X, model='xgb')
cv_splitter, cv = create_rolling_cv_with_gap(X, length=windowsize, split=split, gap=periods_future, set_labels=["train", "test"])

# Train and cross-validate
final_pipeline, test_labels, test_preds = cross_validate_and_train(
    pipeline, X, y, cv_splitter, model_name="In-Sample", verbose_interval=50, clustering=clustering)

# Evaluate
mse, rmse, mae, r2 = evaluate_predictions(test_labels, test_preds, model_name="In-Sample")

# Check the scatter plot of predictions vs actual results (default +/-0.09 threshold markers)
plot_prediction_vs_actual(test_preds, test_labels, title='In-Sample Predictions vs Actual Results')
# Check the primary features that impacted the model
extract_feature_importance(final_pipeline, X, clustering=clustering)

# Show heatmap of the predictions ontop of a price plot
# data.close.vbt.overlay_with_heatmap(test_preds).show_svg()

# Simulate a portfolio
# NOTE(review): shorts open below -0.01 but close above -0.03, so predictions
# between the two trigger both signals - the two short thresholds look swapped; confirm.
insample_pf = simulate_pf(data=data, test_preds=test_preds, open_long_th=0.03, close_long_th=0.0, close_short_th=-0.03, open_short_th=-0.01, plot=False)


# Meta Model

In [None]:
# Meta model: rebuild the data with the base model's predictions supplied and
# meta=True, then cross-validate an XGBoost classifier on it.
metaX, metay = prepare_data(
    data, base_predictions=test_preds, meta=True, pivot_up_th=pivot_up_th, pivot_down_th=pivot_down_th, periods_future=periods_future) # in-sample

# Set up the pipeline and create the cross validation splits
# NOTE(review): `windowsize` and `test_amount` are whatever earlier cells left
# behind (`test_amount` is only set in the expanding-window cell) - confirm
# the values are the ones intended here.
pipeline = create_pipeline(metaX, model='xgb', task='classification')
cv_splitter, cv = create_cv_with_gap(metaX, min_length=windowsize, test_amount=test_amount, gap=periods_future, set_labels=["train", "test"])

# Train and cross-validate (this overwrites `final_pipeline` with the meta model)
final_pipeline, meta_test_labels, meta_test_preds = cross_validate_and_train(
    pipeline, metaX, metay, cv_splitter, model_name="In-Sample", verbose_interval=50, clustering=clustering)

# Evaluate with the classification metrics
acc, prec, recall, f1, auc_roc = evaluate_predictions(meta_test_labels, meta_test_preds, meta=True, model_name="Meta Labeling In-Sample")

In [None]:
# Display the meta model's cross-validated predictions
meta_test_preds

In [None]:
# Disabled threshold search for the meta model.
# NOTE: the commented calls pass `percentile=`, which find_optimal_thresholds
# does not accept (it takes target_percent_pos / target_percent_neg); they
# need updating before being re-enabled.
# strong_percentile = 98 # Enter a trade when the model predicts a return in the top 1% of all returns
# weak_percentile = 95 # Exit a trade when the model predicts a return in the top 5% of all returns
# strong_pos_threshold, strong_neg_threshold = find_optimal_thresholds(y_true=meta_test_labels, y_pred=meta_test_preds, percentile=strong_percentile)
# print(f"Positive threshold with a {strong_percentile}th percent win rate : {strong_pos_threshold}")
# print(f"Negative threshold with a {strong_percentile}th percent win rate: {strong_neg_threshold}")

# weaker_pos_threshold, weaker_neg_threshold = find_optimal_thresholds(y_true=meta_test_labels, y_pred=meta_test_preds, percentile=weak_percentile)
# print(f"Weaker Positive threshold with a {weak_percentile}th percent win rate : {weaker_pos_threshold}")
# print(f"Weaker Negative threshold with a {weak_percentile}th percent win rate: {weaker_neg_threshold}")

# plot_prediction_vs_actual(meta_test_preds, meta_test_labels, pos_threshold=strong_pos_threshold, neg_threshold=strong_neg_threshold, title='In-Sample Predictions vs Actual Results')

# Check the primary features that impacted the model
extract_feature_importance(final_pipeline, metaX, clustering=clustering)

# Show heatmap of the predictions ontop of a price plot
# data.close.vbt.overlay_with_heatmap(test_preds).show_svg()

# # Simulate a portfolio
# insample_pf = simulate_pf(
#     data=data, test_preds=test_preds, open_long_th=strong_pos_threshold, close_long_th=weaker_pos_threshold, close_short_th=weaker_neg_threshold, open_short_th=strong_neg_threshold, plot=False)


In [None]:
# Feature importance of the current `final_pipeline`, reported against the base feature matrix X.
# NOTE(review): if cells run top to bottom, `final_pipeline` was last trained on metaX in the
# meta-model cell — confirm that passing X (rather than metaX) is intended here.
extract_feature_importance(final_pipeline, X)

In [None]:


# Compare cross-validated error on all samples against the error after dropping the
# samples that KMeans assigns to cluster 0 in (actual, predicted) space, then plot the
# clusters over the in-sample test predictions.
# NOTE(review): the clustering uses the true labels of the test rows, so the "filtered"
# score is diagnostic only — it cannot be reproduced on live data.

# Combine the test predictions and labels into a single array (columns: predicted, actual)
test_data = np.column_stack((test_preds, test_labels))
n_clusters = 12
# KMeans over (actual, predicted) pairs with `n_clusters` clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Create a pipeline with all the data
pipeline_all = create_pipeline(X, model='xgb')

# Baseline: evaluate the pipeline with cross-validation.
# cross_val_score fits clones of the estimator, so `pipeline_all` itself is still
# unfitted after this call.
scores_all = cross_val_score(pipeline_all, X, y, cv=50, scoring='neg_mean_squared_error')
print(f'All data scores: {scores_all}')

# Evaluate the pipeline with cross-validation using the clustering step
scores_filtered = []
for train_index, test_index in KFold(n_splits=50).split(X):
    print(f'Fold {len(scores_filtered) + 1}')
    # KFold yields positional indices, so rows must be selected with .iloc
    # (plain X[...] on a DataFrame would be interpreted as column labels).
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The pipeline has to be fitted on this fold's training rows before it can predict
    pipeline_all.fit(X_train, y_train)

    # Fit the clustering algorithm on the training data, columns = (actual, predicted)
    kmeans.fit(np.column_stack((y_train, pipeline_all.predict(X_train))))

    # Get the cluster labels for the test data
    cluster_labels = kmeans.predict(np.column_stack((y_test, pipeline_all.predict(X_test))))

    # Filter out the data points in the first cluster
    keep = cluster_labels != 0
    filtered_data = X_test[keep]
    filtered_labels = y_test[keep]

    # A fold can end up with every test row in cluster 0; record NaN instead of crashing
    if not keep.any():
        scores_filtered.append(np.nan)
        continue

    # Score with negative MSE so the values are comparable with `scores_all`
    score = -mean_squared_error(filtered_labels, pipeline_all.predict(filtered_data))
    scores_filtered.append(score)

print(f'Filtered data scores: {scores_filtered}')

# Plot the original test data
plt.scatter(test_labels, test_preds, alpha=0.5)

# Assign every plotted point to a cluster. The last fold's `cluster_labels` only covers
# that fold's rows, so it cannot be used to mask `test_data`. KMeans was fitted on
# (actual, predicted) columns, hence the column order here.
test_cluster_labels = kmeans.predict(np.column_stack((test_labels, test_preds)))

# Loop through the unique cluster labels and plot the data points for each cluster with a different color
for label in np.unique(test_cluster_labels):
    cluster_points = test_data[test_cluster_labels == label]
    plt.scatter(cluster_points[:, 1], cluster_points[:, 0], alpha=0.5, label=f'Cluster {label}')

# Add a legend, axis labels, and a title
plt.legend()
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Test Predictions vs Actuals')

# Show the plot
plt.show()

Save the model trained up to 2023 on "in sample" data using cross validation

In [None]:

# Persist the current `final_pipeline` with joblib.
# NOTE(review): if cells run top to bottom, `final_pipeline` is the meta (classification)
# pipeline from the "Meta Model" cell at this point — confirm this is the model meant to be saved.
filename = 'models/model_upto_2023_rolling.joblib'
dump(final_pipeline, filename)

### Load the model from storage and train on out of sample data

### Walk Forward Cross Validation on Out of sample data
- Load the model
- Preprocess the data
- Create cross-validation splits for training and testing on newly seen data
- Train the model
- Make predictions
- Test and evaluate the model

In [None]:
# Optional: restore the pipeline saved earlier instead of reusing the in-memory one
# filename = 'models/model_upto_2023_rolling.joblib'
# # Load the model from the .joblib file
# final_pipeline = load(filename)

# Rolling train/test windows over the out-of-sample features, separated by a gap
cv_splitter_oos, cv_oos = create_rolling_cv_with_gap(
    Xoos,
    length=windowsize,
    split=split,
    gap=periods_future,
    set_labels=["train", "test"],
)

# Retrain on every window and collect the predictions for each test window
final_pipeline, oos_test_labels, oos_test_preds = cross_validate_and_train(
    final_pipeline,
    Xoos,
    yoos,
    cv_splitter_oos,
    model_name="Out-Of-Sample",
    verbose_interval=10,
)

# Regression metrics over all out-of-sample test predictions
mse, rmse, mae, r2 = evaluate_predictions(
    oos_test_labels,
    oos_test_preds,
    model_name="Out-Of-Sample",
)

# Scatter of predicted vs actual values, followed by the feature importances
plot_prediction_vs_actual(
    oos_test_preds,
    oos_test_labels,
    title='Out-Of-Sample Predictions vs Actual Results',
)
extract_feature_importance(final_pipeline, X)


In [None]:

# Backtest the out-of-sample predictions: prices are restricted to the timestamps that
# actually received a prediction, and signals come from fixed prediction thresholds.
oos_retraining_pf = vbt.Portfolio.from_signals(
    outofsample_data.close[oos_test_preds.index],
    entries=oos_test_preds.gt(0.005),          # open long above +0.005
    exits=oos_test_preds.lt(0.00),             # close long once the prediction turns negative
    short_entries=oos_test_preds.lt(-0.005),   # open short below -0.005
    short_exits=oos_test_preds.gt(0.0),        # close short once the prediction turns positive
    # direction="both" # long and short
)
print(oos_retraining_pf.stats())
oos_retraining_pf.plot().show()



In [None]:
# Display the out-of-sample portfolio's trade records in readable form
oos_retraining_pf.trades.records_readable

In [None]:
# Save the model
# (the pipeline returned by the out-of-sample cross_validate_and_train call, written with joblib)
total_filename = 'models/out_of_sample_rolling.joblib'
dump(final_pipeline, total_filename)

### Simulate a portfolio in 2023 with retraining the model every 200 bars

In [None]:
# Inspect which timestamps received an out-of-sample prediction
oos_test_preds.index

In [None]:
# Plot the out-of-sample close restricted to predicted timestamps, then draw the full
# out-of-sample close onto the same figure and show it.
fig = outofsample_data.close[oos_test_preds.index].vbt.plot()
outofsample_data.close.vbt.plot(fig=fig).show()
# Separate figure: in-sample close overlaid with a heatmap of the in-sample test predictions
data.close.vbt.overlay_with_heatmap(test_preds).show()

In [None]:
# insample_pf.orders.records_readable   

In [None]:
# insample_pf.trades.records_readable

# Combine insample with out of sample

In [None]:
# Stitch the out-of-sample curve onto the end of the in-sample curve and overlay the
# normalized close prices of both periods on one figure.
# Positional lookups use .iloc: plain series[-1] / series[0] on a label-indexed Series
# relies on a positional fallback that pandas has deprecated.

# Plot the in-sample curve (built from test-set predictions, not from training data)
fig = insample_pf.cumulative_returns.vbt.plot(trace_kwargs=dict(name='Insample'))

# Continue from the last in-sample value by compounding the out-of-sample returns
# NOTE(review): this scaling assumes `cumulative_returns` is an equity-style curve that
# starts at 1; if it starts at 0, the continuation should be (1 + last) * growth - 1 — confirm.
oos = insample_pf.cumulative_returns.iloc[-1] * (1 + oos_retraining_pf.returns).cumprod()
# Add the out of sample equity curve to the plot
oos.vbt.plot(fig=fig, trace_kwargs=dict(name='Out of Sample'))

# In-sample price rebased to 1 at its first bar
normalized_price = data.close / data.close.iloc[0]
# Out-of-sample price rebased so that it starts at the last in-sample normalized price
oos_normalized_price = outofsample_data.close / outofsample_data.close.iloc[0] * normalized_price.iloc[-1]
normalized_price.rename('Normalized Price').vbt.plot(fig=fig)
oos_normalized_price.rename('Out of Sample Normalized Price').vbt.plot(fig=fig)
# The gap is the warmup period for the new model to start making predictions

## Save everything to the models folder for later analysis

In [None]:
# Side-by-side frames of predictions and true labels, in-sample and out-of-sample,
# with the two columns relabelled for export.
column_names = ['Predictions', 'Actuals']
test_preds_vs_actuals = pd.concat([test_preds, test_labels], axis=1).set_axis(list(column_names), axis=1)
oos_test_preds_vs_actuals = pd.concat([oos_test_preds, oos_test_labels], axis=1).set_axis(list(column_names), axis=1)

In [None]:

# Display the last 50 rows of out-of-sample predictions next to their actual labels
oos_test_preds_vs_actuals.tail(50)

In [None]:
# Persist every artifact of this run under models/ for later analysis.
# All paths are relative, so the models/ directory must already exist in the working directory.

# In-sample portfolio: pickled object, summary stats and trade records
insample_pf.save('models/insample_test_portfolio_rolling.pkl')
insample_pf.stats().to_csv('models/insample_stats_test_rolling.csv')
insample_pf.trades.records_readable.to_csv('models/insample_trades_test_rolling.csv')
# Feature matrices and labels for both periods
X.to_csv('models/insample_X_test_rolling.csv')
y.to_csv('models/insample_y_test_rolling.csv')
Xoos.to_csv('models/oos_X_test_rolling.csv')
yoos.to_csv('models/oos_y_test_rolling.csv')
# Predictions next to their actual labels
test_preds_vs_actuals.to_csv('models/insample_preds_vs_actuals_test_rolling.csv')
oos_test_preds_vs_actuals.to_csv('models/oos_preds_vs_actuals_test_rolling.csv')
# Out-of-sample (retrained) portfolio: pickled object, summary stats and trade records
oos_retraining_pf.save('models/oos_retrained_portfolio_rolling.pkl')
oos_retraining_pf.stats().to_csv('models/oos_retrained_stats_rolling.csv')
oos_retraining_pf.trades.records_readable.to_csv('models/oos_retrained_trades_rolling.csv')

# Explore which features are impacting the model

In [None]:

        
# Report the feature importances of the current `final_pipeline` against the feature matrix X
extract_feature_importance(final_pipeline, X)



There is a lot to unpack in the output above. Why are the feature importance scores so different from the F-scores of the same features?

# Hyperparameter Tuning

### Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Sampling distributions for the pipeline's "model" step.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_dist = dict([
    ('model__learning_rate', uniform(0.01, 0.2)),      # 0.01 .. 0.21
    ('model__n_estimators', randint(100, 1000)),       # 100 .. 999
    ('model__max_depth', randint(3, 10)),              # 3 .. 9
    ('model__min_child_weight', randint(1, 10)),       # 1 .. 9
    ('model__subsample', uniform(0.5, 0.5)),           # 0.5 .. 1.0
    ('model__colsample_bytree', uniform(0.5, 0.5)),    # 0.5 .. 1.0
    # add other parameters here
])

# Draw 10 random candidates and score each one on the `cv` splits with negative MSE
# NOTE(review): `pipeline` and `cv` come from earlier cells; if the meta-model cell ran
# last they refer to the classification pipeline and the metaX splits — confirm.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=cv,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=10,
    random_state=42,
)
random_search.fit(X, y)

# Report the winning candidate and its mean cross-validated score
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")


In [None]:
# Walk-forward evaluation of the tuned pipeline: for every rolling split, fit a fresh copy
# of the best estimator on the "train" slice and predict the "test" slice, then score all
# test predictions together.
from sklearn.base import clone

# NOTE(review): other cells pass gap=periods_future; confirm `gap` is defined and intended here.
cv_splitter, cv = create_rolling_cv_with_gap(X, length=windowsize, split=split, gap=gap, set_labels=['train', 'test'])

# The slices are obtained by using your cv_splitter on your X and y.
X_slices = cv_splitter.take(X)
y_slices = cv_splitter.take(y)

# Fit and predict with the best estimator
test_labels = []
test_preds = []
# The loop variable must not be called `split`: that name holds the train/test ratio
# passed to create_rolling_cv_with_gap above, and overwriting it breaks any re-run.
for split_idx in X_slices.index.unique(level="split"):
    X_train_slice = X_slices[(split_idx, "train")]
    y_train_slice = y_slices[(split_idx, "train")]
    X_test_slice = X_slices[(split_idx, "test")]
    y_test_slice = y_slices[(split_idx, "test")]

    # Fit an unfitted copy so that random_search.best_estimator_ (refitted on all of X by
    # the search) is not silently replaced by a model trained on the last slice only.
    slice_pipeline = clone(random_search.best_estimator_).fit(X_train_slice, y_train_slice)
    test_pred = slice_pipeline.predict(X_test_slice)
    test_pred = pd.Series(test_pred, index=y_test_slice.index)
    test_labels.append(y_test_slice)
    test_preds.append(test_pred)
    print(f"MSE for split {split_idx}: {mean_squared_error(y_test_slice, test_pred)}")


# Stitch the per-split results into one labels Series and one predictions Series
test_labels = pd.concat(test_labels).rename("labels")
test_preds = pd.concat(test_preds).rename("preds")

# Show the accuracy of the predictions
mse = mean_squared_error(test_labels, test_preds)
rmse = np.sqrt(mse)  # or use mean_squared_error with squared=False
mae = mean_absolute_error(test_labels, test_preds)
r2 = r2_score(test_labels, test_preds)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

# Visualize the predictions as a heatmap plotted against the price
# data.close.vbt.overlay_with_heatmap(test_preds).show_svg()

In [None]:
# Save the model with the best parameters
import json

# Write the tuned pipeline from the randomized search to disk with joblib
dump(random_search.best_estimator_, 'models/xgboost_best_estimator_rolling.joblib')

# Store the winning hyperparameters as JSON next to the model
with open('models/xgboost_best_params_rolling.json', 'w') as fp:
    fp.write(json.dumps(random_search.best_params_))


In [None]:

# Backtest the tuned model's test predictions: prices are restricted to the timestamps
# that received a prediction, and signals come from fixed prediction thresholds.
hyperopt_pf = vbt.Portfolio.from_signals(
    data.close[test_preds.index],
    entries=test_preds.gt(0.04),           # open long above +0.04
    exits=test_preds.lt(0.00),             # close long once the prediction turns negative
    short_entries=test_preds.lt(-0.04),    # open short below -0.04
    short_exits=test_preds.gt(0.0),        # close short once the prediction turns positive
    # direction="both" # long and short
)
print(hyperopt_pf.stats())
hyperopt_pf.plot().show()

In [None]:
# Persist the tuned-model backtest: pickled portfolio, trade and order records, and the
# predictions that generated the signals.
hyperopt_pf.save('models/hyperopt_portfolio_rolling.pkl')
hyperopt_pf.trades.records_readable.to_csv('models/hyperopt_trades_rolling.csv')
hyperopt_pf.orders.records_readable.to_csv('models/hyperopt_orders_rolling.csv')
test_preds.to_csv('models/hyperopt_preds_rolling.csv')


### Grid Search Method
#### DON'T RUN WITHOUT GPU

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Specify hyperparameters to tune and their respective ranges
# param_grid = {
#     'model__learning_rate': [0.01, 0.1, 0.2],
#     'model__n_estimators': [100, 500, 1000],
#     'model__max_depth': [3, 5, 7],
#     'model__min_child_weight': [1, 5, 10],
#     'model__subsample': [0.5, 0.7, 1.0],
#     'model__colsample_bytree': [0.5, 0.7, 1.0]
#     # add other parameters here
# }

# # Perform grid search
# grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring="r2", n_jobs=-1, verbose=10)
# grid_search.fit(X, y)

# # Best parameters and score from grid search
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best score: {grid_search.best_score_}")


In [None]:
# --- Disabled cell: retrain/evaluate with the best estimator ---
# NOTE(review): before re-enabling, check this against the live cells in this notebook:
#   * they call create_rolling_cv_with_gap(...), not create_rolling_cv(...)
#   * they unpack cross_validate_and_train as (final_pipeline, test_labels, test_preds)
#     and pass the full X / y rather than pre-sliced data
# cv_splitter, cv = create_rolling_cv(X, length=200, split=.9)

# # The slices are obtained by using your cv_splitter on your X and y.
# X_slices = cv_splitter.take(X)
# y_slices = cv_splitter.take(y)

# # Here, we train the model using the slices and the best estimator from your RandomizedSearchCV
# test_labels, test_preds, final_pipeline = cross_validate_and_train(random_search.best_estimator_, X_slices, y_slices, cv_splitter, model_name="Random Search Best Estimator")

# # And now we evaluate the predictions.
# mse, rmse, mae, r2 = evaluate_predictions(test_labels, test_preds, model_name="Random Search Best Estimator")


In [None]:
# grid_pf = vbt.Portfolio.from_signals(
#     data.close[test_preds.index], # use only the test set
#     entries         = test_preds > 0.05, # long when prediction > X%
#     exits           = test_preds < 0.00, # exit when prediction is negative
#     short_entries   = test_preds < -0.05, # short when prediction < -X%
#     short_exits     = test_preds > 0.00, # exit when prediction is positive
# )
# print(grid_pf.stats())
# grid_pf.plot().show()
