# Find Decision Boundary for Test DataFrames

In the previous notebooks, we have generated both the train and test dataframes, and the XGBoost model.

We need to run the model on the test data, in order to check whether the generated model produces successful operations.

This notebook will also take the XGBoost models' text dumps and generate the Python code that will be used in the FastAPI server.

In [None]:
# SHAP explanations are switched off for this notebook; the flag also decides
# whether the `shap` package gets imported at all.
SHAP_ENABLED = False

In [None]:
import os
import sys
import gc
import pandas as pd
import joblib
if SHAP_ENABLED:
    import shap
from bokeh.resources import INLINE
from bokeh.io import output_notebook
import matplotlib.pyplot as plt

# Set the syspath in order to import the regular pyautotrader module.
to_append = os.getcwd() + os.sep + '..' + os.sep + '..' + os.sep + '..' + os.sep + '..' + os.sep + '..'
print(to_append)
sys.path.append(to_append)

from pyautotrader.utils.model_export import export_model_python, create_ast_from_xgboost_dump

In [None]:
import platform

# Images are generated by default only when running on Windows; defining the
# SHOULD_GENERATE_IMAGES environment variable (with any value) forces it on.
SHOULD_GENERATE_IMAGES = (
    "SHOULD_GENERATE_IMAGES" in os.environ
    or platform.system() == 'Windows'
)

In [None]:
# Direct Bokeh output to the notebook using the INLINE resources imported from
# bokeh.resources (presumably so BokehJS is embedded rather than fetched from
# a CDN -- confirm against the Bokeh documentation).
output_notebook(INLINE)

Load the Models

In [None]:
# Default folder holding the data and model files this notebook reads and
# writes, expressed relative to the current working directory.
DATA_OUTPUT_DIR = os.path.join('..', '00.data', 'output')

In [None]:
# Let the DATA_OUTPUT_DIR environment variable, when defined, replace the
# folder configured so far.
try:
    DATA_OUTPUT_DIR = os.environ["DATA_OUTPUT_DIR"]
except KeyError:
    pass  # variable not defined: keep the current value

In [None]:
# Discover the pickled artefacts stored in DATA_OUTPUT_DIR.
#
# Each file name is split on '.': the last token is the 'pickle' extension,
# the token before it identifies the kind of artefact and everything before
# that is joined back together as the model name.
#
# os.listdir() returns its entries in arbitrary order, so the names are sorted
# to make the mapping -- and whichever model is taken first from it --
# reproducible between runs and platforms.
files_found = sorted(x for x in os.listdir(DATA_OUTPUT_DIR) if x.endswith('.pickle'))
files_found_tokens = [x.split('.') for x in files_found]

# models_found[model name][artefact kind] -> full path of the pickle file
models_found = {}

for current_model in files_found_tokens:
    model_name = '.'.join(current_model[:-2])
    filename = os.path.join(DATA_OUTPUT_DIR, '.'.join(current_model))
    models_found.setdefault(model_name, {})[current_model[-2]] = filename

print(models_found)

We will need to load the total dataframe, the parameters dataframe and the raw dataframe, alongside the short and long models

In [None]:
# Only the first model found in the output folder is analysed here.
first_model = list(models_found)[0]
_first_model_files = models_found[first_model]

# Datasets and parameters saved for that model.
current_total_dataset = joblib.load(_first_model_files['total'])
current_parameters = joblib.load(_first_model_files['parameters'])
current_raw_dataset = joblib.load(_first_model_files['raw'])

# Trained models for the short and the long side.
best_short_booster = joblib.load(_first_model_files['xgboostshortmodel'])
best_long_booster = joblib.load(_first_model_files['xgboostlongmodel'])

Configure the parameters

In [None]:
# Expose the settings stored in the parameters pickle as notebook-level
# constants, each one named after its key.
(
    CURRENT_EXCHANGE,
    CURRENT_ASSET,
    CURRENT_TIMEFRAME,
    CURRENT_TARGET,
    CURRENT_STOP,
    MAX_TRADE_DURATION,
    DECISION_BOUNDARY,
) = (
    current_parameters[_parameter_name]
    for _parameter_name in (
        'CURRENT_EXCHANGE',
        'CURRENT_ASSET',
        'CURRENT_TIMEFRAME',
        'CURRENT_TARGET',
        'CURRENT_STOP',
        'MAX_TRADE_DURATION',
        'DECISION_BOUNDARY',
    )
)

Generate the Dataframes

In [None]:
# Wrap the loaded total dataset in a DataFrame for the column-wise work below.
df_current_total_dataset = pd.DataFrame(data=current_total_dataset)

In [None]:
# Keep only the candle identification, the two label columns and the feature
# columns listed in the model parameters.
_kept_columns = ['current_date', 'current_time', 'is_short', 'is_long'] + current_parameters['CURRENT_X_COLUMNS']
df_current_total_dataset = df_current_total_dataset[_kept_columns]

Some simple functions to run the predictions of the models in all dataframes.

In [None]:
def predict_short(row):
    """Return the short model's prediction for a single dataframe row.

    The feature columns named in ``current_parameters['CURRENT_X_COLUMNS']``
    are taken from ``row``, reshaped into a one-row 2-D array and passed to
    the short booster's ``inplace_predict``; the first (only) prediction is
    returned.
    """
    feature_names = current_parameters['CURRENT_X_COLUMNS']
    features = row[feature_names].to_numpy().reshape(1, -1)
    booster = best_short_booster.get_booster()
    predictions = booster.inplace_predict(features)
    return predictions[0]

def predict_long(row):
    """Return the long model's prediction for a single dataframe row.

    The feature columns named in ``current_parameters['CURRENT_X_COLUMNS']``
    are taken from ``row``, reshaped into a one-row 2-D array and passed to
    the long booster's ``inplace_predict``; the first (only) prediction is
    returned.
    """
    feature_names = current_parameters['CURRENT_X_COLUMNS']
    features = row[feature_names].to_numpy().reshape(1, -1)
    booster = best_long_booster.get_booster()
    predictions = booster.inplace_predict(features)
    return predictions[0]

def _decision_cost(predict_column, label_column):
    """Build a row function giving the squared error of the thresholded prediction.

    The prediction counts as 1 when it is at or above DECISION_BOUNDARY and
    as 0 otherwise; the label column is subtracted and the difference squared.
    """
    def cost(row):
        decision = 1 if row[predict_column] >= DECISION_BOUNDARY else 0
        return (decision - row[label_column]) ** 2
    return cost


# Raw model outputs for every row of the dataset.
df_current_total_dataset['short_predict'] = df_current_total_dataset.apply(predict_short, axis=1)
df_current_total_dataset['long_predict'] = df_current_total_dataset.apply(predict_long, axis=1)

# Per-row disagreement between the thresholded prediction and the label.
df_current_total_dataset['short_cost'] = df_current_total_dataset.apply(_decision_cost('short_predict', 'is_short'), axis=1)
df_current_total_dataset['long_cost'] = df_current_total_dataset.apply(_decision_cost('long_predict', 'is_long'), axis=1)

We will now create an Excel spreadsheet with the generated predictions.

In [None]:
# Spreadsheet named after the exchange, asset, timeframe, target and stop.
check_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.check_model.xlsx"
check_full_file_name = os.path.join(DATA_OUTPUT_DIR, check_file_name)

# Only the candle identification, labels, predictions and costs are exported.
_check_columns = [
    'current_date',
    'current_time',
    'is_short',
    'is_long',
    'short_predict',
    'long_predict',
    'short_cost',
    'long_cost',
]
df_check_predict = df_current_total_dataset[_check_columns]
df_check_predict.to_excel(check_full_file_name)

Clear some memory using the garbage collector

In [None]:
# Drop the references that are no longer needed, then ask the garbage
# collector to run.
current_total_dataset = df_check_predict = None
gc.collect()

In [None]:
def predict_shap_short(row):
    """Describe the short model's SHAP values for one row as a string.

    A ``shap.TreeExplainer`` is built for the short model and applied to the
    row's feature columns.  Each feature name is paired with its SHAP value,
    the pairs are sorted by ascending value and the resulting list of
    ``{'desc': ..., 'value': ...}`` dicts is returned as its ``str()`` form.
    """
    feature_names = current_parameters['CURRENT_X_COLUMNS']
    features = row[feature_names].to_numpy().reshape(1, -1)
    # shap_values() is given a single-row batch, so take its first entry.
    row_shap_values = shap.TreeExplainer(best_short_booster).shap_values(features)[0]
    described = [
        {'desc': name, 'value': row_shap_values[position]}
        for position, name in enumerate(feature_names)
    ]
    return str(sorted(described, key=lambda entry: entry['value']))
    

def predict_shap_long(row):
    """Describe the long model's SHAP values for one row as a string.

    A ``shap.TreeExplainer`` is built for the long model and applied to the
    row's feature columns.  Each feature name is paired with its SHAP value,
    the pairs are sorted by ascending value and the resulting list of
    ``{'desc': ..., 'value': ...}`` dicts is returned as its ``str()`` form.
    """
    feature_names = current_parameters['CURRENT_X_COLUMNS']
    features = row[feature_names].to_numpy().reshape(1, -1)
    # shap_values() is given a single-row batch, so take its first entry.
    row_shap_values = shap.TreeExplainer(best_long_booster).shap_values(features)[0]
    described = [
        {'desc': name, 'value': row_shap_values[position]}
        for position, name in enumerate(feature_names)
    ]
    return str(sorted(described, key=lambda entry: entry['value']))

    
def _short_shap_or_blank(row):
    """Return the short SHAP description when SHAP is enabled and the short prediction is positive, else ""."""
    if SHAP_ENABLED and row['short_predict'] > 0:
        return predict_shap_short(row)
    return ""


def _long_shap_or_blank(row):
    """Return the long SHAP description when SHAP is enabled and the long prediction is positive, else ""."""
    if SHAP_ENABLED and row['long_predict'] > 0:
        return predict_shap_long(row)
    return ""


df_current_total_dataset['short_shap'] = df_current_total_dataset.apply(_short_shap_or_blank, axis=1)
df_current_total_dataset['long_shap'] = df_current_total_dataset.apply(_long_shap_or_blank, axis=1)

Let us show some predictions, just for testing purposes

In [None]:
# Show the SHAP descriptions of the first ten rows whose short prediction is positive.
_has_positive_short_predict = df_current_total_dataset['short_predict'] > 0
df_current_total_dataset.loc[_has_positive_short_predict, 'short_shap'].head(10)

We will create a histogram of all prediction values for all frames using the short model

In [None]:
# Image file that receives the histogram.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.hist_short.png"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

# Distribution of the short model's predictions, in 500 bins.
fig, ax = plt.subplots()
df_current_total_dataset.hist(column='short_predict', ax=ax, bins=500)
fig.savefig(model_full_file_name)

We will create a histogram of all prediction values for all frames using the long model

In [None]:
# Image file that receives the histogram.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.hist_long.png"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

# Distribution of the long model's predictions, in 500 bins.
fig, ax = plt.subplots()
df_current_total_dataset.hist(column='long_predict', ax=ax, bins=500)
fig.savefig(model_full_file_name)

We are now going to generate the trades, and we need to create a separate data structure with all the predicted data.

In [None]:
# One dict per dataframe row.
results_from_df = df_current_total_dataset.to_dict(orient='records')

# Predictions and SHAP descriptions keyed by current_date * 10000 + current_time,
# so a candle's predictions can be looked up from its date and time.
results = {
    (record['current_date'] * 10000) + record['current_time']: {
        'short_predict': record['short_predict'],
        'long_predict': record['long_predict'],
        'short_shap': record['short_shap'],
        'long_shap': record['long_shap'],
    }
    for record in results_from_df
}

# Flat lists of every short and long prediction, in row order.
short_results = [record['short_predict'] for record in results_from_df]
long_results = [record['long_predict'] for record in results_from_df]




Now we are going to loop through the dataset, check whether each candle generates a trade, and then determine the trade's final result. Please note that we might still need to add some risk management, such as the handling of the stops.

After a trade is initiated, it will be executed until it reaches its Gain Target or the Stop loss or it has reached MAX_TRADE_DURATION.

In [None]:
def _find_exit_price(trade, candle, candles_seen):
    """Return the price at which ``trade`` is closed by ``candle``, or ``None``.

    A short trade closes at its target when the candle's low reaches it,
    otherwise at its stop when the candle's high reaches it.  A long trade
    closes at its stop when the candle's low reaches it, otherwise at its
    target when the candle's high reaches it.  When neither level is hit and
    ``candles_seen`` exceeds the trade's start candle by more than
    MAX_TRADE_DURATION, the trade is closed at the candle's close.
    """
    if trade['trade_type'] == 'short':
        if candle['low'] <= trade['trade_target']:
            return trade['trade_target']
        if candle['high'] >= trade['trade_stop']:
            return trade['trade_stop']
    else:
        if candle['low'] <= trade['trade_stop']:
            return trade['trade_stop']
        if candle['high'] >= trade['trade_target']:
            return trade['trade_target']
    if candles_seen > (trade['start_candle'] + MAX_TRADE_DURATION):
        return candle['close']
    return None


def _settle_trade(trade, exit_price):
    """Store the exit price and the signed result (2 decimals) on ``trade``.

    A short earns ``entry - exit`` and a long earns ``exit - entry``.  Using
    one formula per side for every kind of exit keeps target, stop and
    time-out results consistent: a long closed at its stop is recorded as a
    loss and a long closed at its target as a gain.
    """
    if trade['trade_type'] == 'short':
        trade['result'] = round(trade['trade_start'] - exit_price, 2)
    else:
        trade['result'] = round(exit_price - trade['trade_start'], 2)
    trade['final_close'] = exit_price


# Prediction bounds and cursors; they are only initialised in this cell.
minimum_short_predict = 0
minimum_long_predict = 0

maximum_short_predict = int(round(max(short_results), 0))
maximum_long_predict = int(round(max(long_results), 0))

current_short_predict = 0
current_long_predict = 0

current_trade = None          # the open trade, if any (at most one at a time)
current_trade_entries = []    # every trade opened, in chronological order
processed_dates = {}          # dates that already produced a trade (one trade per date)

candle_count = 0
current_target = current_parameters['CURRENT_TARGET']
current_stop = current_parameters['CURRENT_STOP']

for current_candle in current_raw_dataset:
    if current_trade is not None:
        exit_price = _find_exit_price(current_trade, current_candle, candle_count)
        if exit_price is not None:
            # The dict is also referenced from current_trade_entries, so the
            # result recorded here shows up in the exported trades.
            _settle_trade(current_trade, exit_price)
            current_trade = None
            # NOTE: the candle that closes a trade never opens a new one and
            # does not advance candle_count.
            continue

    if current_trade is None and (current_candle['Date'] not in processed_dates):
        if current_candle['Date'] > current_parameters['MINIMUM_DATE_TRADE'] and \
           current_parameters['MINIMUM_TIME'] <= current_candle['Time'] <= current_parameters['MAXIMUM_TIME']:
            current_date_time = (current_candle['Date'] * 10000) + current_candle['Time']
            current_prediction = results[current_date_time]
            short_signal = current_prediction['short_predict'] > DECISION_BOUNDARY
            long_signal = current_prediction['long_predict'] > DECISION_BOUNDARY
            is_entry_point = short_signal or long_signal
            if is_entry_point:
                # The side follows the signal that crossed the decision
                # boundary (short wins when both do).  Testing
                # `short_predict > 0` instead would label a long-only signal
                # as a short whenever the short prediction is merely positive.
                is_short = short_signal
                current_trade = {**current_candle,
                                 'trade_type': 'short' if is_short else 'long',
                                 'start_candle': candle_count,
                                 'trade_start': current_candle['close'],
                                 'predicted': current_prediction['short_predict'] if is_short else current_prediction['long_predict'],
                                 'shap': current_prediction['short_shap'] if is_short else current_prediction['long_shap']
                                }
                # Target and stop are percentages of the entry price.
                if is_short:
                    current_trade['trade_target'] = current_candle['close'] * (1 - (current_target / 100))
                    current_trade['trade_stop'] = current_candle['close'] * (1 + (current_stop / 100))
                else:
                    current_trade['trade_target'] = current_candle['close'] * (1 + (current_target / 100))
                    current_trade['trade_stop'] = current_candle['close'] * (1 - (current_stop / 100))

                processed_dates[current_candle['Date']] = '1'
                current_trade_entries.append(current_trade)

    candle_count += 1


After we have iterated over all the candles in the dataset and generated the trades, we can export them to Excel

In [None]:
# Spreadsheet that receives one row per generated trade.
raw_trades_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.trades.xlsx"
raw_trades_full_file_name = os.path.join(DATA_OUTPUT_DIR, raw_trades_file_name)

trades = pd.DataFrame(data=current_trade_entries)
trades.to_excel(raw_trades_full_file_name)

Now, we export the XGBoost model from the txt dump into python code for the Short Model

In [None]:
# Output: the Python module file that will hold the exported short model.
# 'process_short' is the name handed to the exporter (the notebook later
# imports a function of that name from the generated module).
python_script_name = 'process_short'
python_script_name_short = f'{CURRENT_EXCHANGE}_{CURRENT_ASSET}_{CURRENT_TIMEFRAME}_{int(CURRENT_TARGET * 100)}_{int(CURRENT_STOP * 100)}_process_short'
python_code_model_full_file_name = os.path.join(DATA_OUTPUT_DIR, python_script_name_short + '.py')

# Input: the text dump of the short XGBoost model.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostshortmodel.txt"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

# Parse the dump and write it out as Python code.
ast = create_ast_from_xgboost_dump(model_full_file_name)
export_model_python(ast, python_script_name, python_code_model_full_file_name, 0.5)

Now, we export the XGBoost model from the txt dump into Python code for the Long Model

In [None]:
# Output: the Python module file that will hold the exported long model.
# 'process_long' is the name handed to the exporter (the notebook later
# imports a function of that name from the generated module).
python_script_name = 'process_long'
python_script_name_long = f'{CURRENT_EXCHANGE}_{CURRENT_ASSET}_{CURRENT_TIMEFRAME}_{int(CURRENT_TARGET * 100)}_{int(CURRENT_STOP * 100)}_process_long'
python_code_model_full_file_name = os.path.join(DATA_OUTPUT_DIR, python_script_name_long + '.py')

# Input: the text dump of the long XGBoost model.
model_file_name = f"{CURRENT_EXCHANGE}.{CURRENT_ASSET}.{CURRENT_TIMEFRAME}.{int(CURRENT_TARGET * 100)}.{int(CURRENT_STOP * 100)}.xgboostlongmodel.txt"
model_full_file_name = os.path.join(DATA_OUTPUT_DIR, model_file_name)

# Parse the dump and write it out as Python code.
ast = create_ast_from_xgboost_dump(model_full_file_name)
export_model_python(ast, python_script_name, python_code_model_full_file_name, 0.5)

We add the output folder to the PYTHONPATH and import the generated code.

In [None]:
import importlib

# Make the output folder importable; the guard keeps sys.path free of
# duplicate entries when this cell is executed more than once.
if DATA_OUTPUT_DIR not in sys.path:
    sys.path.append(DATA_OUTPUT_DIR)

# The generated modules are written while this notebook is running, so the
# import system's directory caches may not know about them yet.
importlib.invalidate_caches()

# import_module() takes the module name as a plain string, so no import
# statement has to be assembled and exec()'d, and names that are not valid
# Python identifiers still work.
process_short = importlib.import_module(python_script_name_short).process_short
process_long = importlib.import_module(python_script_name_long).process_long

We now run the inference process again, this time using the generated Python code, and check whether its output is the same as the one produced by XGBoost itself.

In [None]:
# Largest absolute difference between the XGBoost prediction and the
# generated code's prediction that is still treated as "no bias".
_BIAS_TOLERANCE = 0.00001

# Re-run the inference for every row with the generated Python functions.
results_from_df = [
    {**x, 'short_from_code': process_short(x), 'long_from_code': process_long(x)}
    for x in results_from_df
]
check_results = pd.DataFrame(results_from_df)

# The bias columns hold 0 when both predictions agree within the tolerance and
# the tolerance value otherwise.  The difference is compared in absolute
# value: without abs(), a generated prediction larger than the XGBoost one
# gives a negative difference that always passes the check, however far off
# it is.
check_results['short_bias'] = check_results.apply(
    lambda row: 0 if abs(row['short_predict'] - row['short_from_code']) < _BIAS_TOLERANCE else _BIAS_TOLERANCE,
    axis=1,
)
check_results['long_bias'] = check_results.apply(
    lambda row: 0 if abs(row['long_predict'] - row['long_from_code']) < _BIAS_TOLERANCE else _BIAS_TOLERANCE,
    axis=1,
)

In [None]:
# Display the first hundred rows of the comparison.
check_results.head(n=100)