# xgboost and Tabular data
## update tabular-trees notebook with xgboost and data preperation


See: 
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier

This is done with a 15m data set. 

Another approaches are: 
- 1 minute data set (2 std and 0.1% profit)
- more months
- do some descriptive data on MA length and when prices are out of BB range
- extending the dependent variable to more periods, so that the X% rise or fall occurs in the next e.g. 2-5 candles and not in the next candle 

In [20]:
# import data from septembre of 2022 since it was rather stable during this period
import pandas as pd
import numpy as np
import os
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if iskaggle:
    df = pd.read_csv("/kaggle/input/btcusdt-2023-6-9/btcusdt-2023-6_9.csv", index_col=0).reset_index(drop=True)
else:
    df = pd.read_csv("btc-data/btcusdt-2023-6_9.csv", index_col=0).reset_index(drop=True)


print(df.shape)
df.head(3)

(11716, 6)


Unnamed: 0,time,open,high,low,close,vol
0,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,386.675
1,2023.06.01 00:15,27096.9,27096.9,27036.7,27047.0,408.68
2,2023.06.01 00:30,27047.0,27077.4,27041.0,27054.9,275.08


In [21]:
# show that one can calculate the vol and vol_coin numbers from the API call 1
print(386.675*((27108.1+27080.6)/2))

# take average of high and low and then multiply by vol_coin (which is available)
#  vol        vol_coin
#, '200082', '2.81566749'
2.81566749*((71063+71053)/2) 

10476707.786249999


200075.70050442

In [22]:
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'vol'], dtype='object')

In [23]:
len(df.high)

11716

In [24]:
# clear column names of whitespaces etc. for renaming purposes
df.columns = df.columns.str.strip()

# rename vol to vol_coin bc of API call consistency data mapping (vol is in paper currency)
df.rename(columns={"vol": "vol_coin"},inplace=True)


# idx = 0
# new_col = [7, 8, 9]  # can be a list, a Series, an array or a scalar   
# df.insert(loc=idx, column='A', value=new_col)

# add vol of coin in dollar currency
vol = df.vol_coin * ((df.high+df.low)/2)

df.insert(loc = 5, column = "vol",  value=vol)


In [25]:
df.head(1) # now the mapping is in line with the API call structure 

Unnamed: 0,time,open,high,low,close,vol,vol_coin
0,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,10476710.0,386.675


In [26]:
# transform to date_time
df["time"] = pd.to_datetime(df["time"])

# Extracting day and hour
df["day"] = df["time"].dt.day
df["hour"] = df["time"].dt.hour

df.head(1)

Unnamed: 0,time,open,high,low,close,vol,vol_coin,day,hour
0,2023-06-01,27103.1,27108.1,27080.6,27096.9,10476710.0,386.675,1,0


In [32]:
# the 10-day moving average
df["10MA"] = df["close"].rolling(window=10).mean().shift(-9)

# Calculate the standard deviation of the closing prices over the same 20-day period
df["10STD"] = df["close"].rolling(window=10).std().shift(-9)

# volume weighted close price
df["vwap"] = df.close/df.vol

# Bollinger Band: 
# simple_moving_average(20) + std x 2 | sma - std x 2
df["bb_upper_band"] = df["10MA"] + 2 * df["10STD"]
df["bb_lower_band"] = df["10MA"] - 2 * df["10STD"]

# daily_open
# df["daily_open"] = df.groupby(df["day"])["open"].transform("first")

# daily_close
# df["daily_close"] = df.groupby(df["day"])["close"].transform("first")


# Calculate On-Balance Volume (OBV)
df["change"] = df["close"].diff(periods=-1)
df["direction"] = df["change"].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
df["obv"] = df["vol"] * df["direction"]
df["obv"] = df["obv"].cumsum()

In [28]:
df[final_cols]

Unnamed: 0,time,day,hour,10MA,10STD,bb_upper_band,bb_lower_band
0,2023-06-01 00:00:00,1,0,27117.44,47.868273,27213.176546,27021.703454
1,2023-06-01 00:15:00,1,0,27132.45,62.122858,27256.695715,27008.204285
2,2023-06-01 00:30:00,1,0,27147.65,57.300422,27262.250844,27033.049156
3,2023-06-01 00:45:00,1,0,27172.57,66.008435,27304.586869,27040.553131
4,2023-06-01 01:00:00,1,1,27191.57,65.019160,27321.608321,27061.531679
...,...,...,...,...,...,...,...
11711,2023-09-30 23:00:00,30,23,,,,
11712,2023-09-30 23:15:00,30,23,,,,
11713,2023-09-30 23:30:00,30,23,,,,
11714,2023-09-30 23:45:00,30,23,,,,


In [29]:
cols = ["open","high","low","close","vol","vol_coin","vwap","obv","change","direction"]

shifted_columns = {}

for i in range(9):
    for col in cols: 
        shifted_columns[col+"_"+str(i)] = df[col].shift(i)


# Concatenate the original DataFrame with the new shifted columns DataFrame
final_cols = ['time', 'day', 'hour', '10MA', '10STD', 'bb_upper_band','bb_lower_band']
shifted_df = pd.concat([df[final_cols], pd.DataFrame(shifted_columns)], axis=1)
shifted_df.columns

Index(['time', 'day', 'hour', '10MA', '10STD', 'bb_upper_band',
       'bb_lower_band', 'open_0', 'high_0', 'low_0', 'close_0', 'vol_0',
       'vol_coin_0', 'vwap_0', 'obv_0', 'change_0', 'direction_0', 'open_1',
       'high_1', 'low_1', 'close_1', 'vol_1', 'vol_coin_1', 'vwap_1', 'obv_1',
       'change_1', 'direction_1', 'open_2', 'high_2', 'low_2', 'close_2',
       'vol_2', 'vol_coin_2', 'vwap_2', 'obv_2', 'change_2', 'direction_2',
       'open_3', 'high_3', 'low_3', 'close_3', 'vol_3', 'vol_coin_3', 'vwap_3',
       'obv_3', 'change_3', 'direction_3', 'open_4', 'high_4', 'low_4',
       'close_4', 'vol_4', 'vol_coin_4', 'vwap_4', 'obv_4', 'change_4',
       'direction_4', 'open_5', 'high_5', 'low_5', 'close_5', 'vol_5',
       'vol_coin_5', 'vwap_5', 'obv_5', 'change_5', 'direction_5', 'open_6',
       'high_6', 'low_6', 'close_6', 'vol_6', 'vol_coin_6', 'vwap_6', 'obv_6',
       'change_6', 'direction_6', 'open_7', 'high_7', 'low_7', 'close_7',
       'vol_7', 'vol_coin_7', 'v

In [31]:
print(len(shifted_df.columns))

97


In [24]:
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'vol', 'vol_coin', 'day',
       'hour', '10MA', '10STD', 'vwap', 'bb_upper_band', 'bb_lower_band',
       'change', 'direction', 'obv', 'open_0', 'high_0', 'low_0', 'close_0',
       'vol_0', 'vol_coin_0', 'open_1', 'high_1', 'low_1', 'close_1', 'vol_1',
       'vol_coin_1', 'open_2', 'high_2', 'low_2', 'close_2', 'vol_2',
       'vol_coin_2', 'open_3', 'high_3', 'low_3', 'close_3', 'vol_3',
       'vol_coin_3'],
      dtype='object')

In [16]:
shifted_df.head()

Unnamed: 0,time,day,hour,10MA,10STD,bb_upper_band,bb_lower_band,open_0,high_0,low_0,...,open_9,high_9,low_9,close_9,vol_9,vol_coin_9,vwap_9,obv_9,change_9,direction_9
0,2023-06-01 00:00:00,1,0,27117.44,47.868273,27213.176546,27021.703454,27103.1,27108.1,27080.6,...,,,,,,,,,,
1,2023-06-01 00:15:00,1,0,27132.45,62.122858,27256.695715,27008.204285,27096.9,27096.9,27036.7,...,,,,,,,,,,
2,2023-06-01 00:30:00,1,0,27147.65,57.300422,27262.250844,27033.049156,27047.0,27077.4,27041.0,...,,,,,,,,,,
3,2023-06-01 00:45:00,1,0,27172.57,66.008435,27304.586869,27040.553131,27054.9,27084.0,27054.8,...,,,,,,,,,,
4,2023-06-01 01:00:00,1,1,27191.57,65.01916,27321.608321,27061.531679,27084.0,27113.9,27073.5,...,,,,,,,,,,


## dependent variable

In [33]:

# GOAL:
# if the price is outside of the bollinger bands: 
#    is the next candle 0.5% away from the bb value?

# Create a new column "y_bb" initialized with False
df["y_bb"] = False

# Check conditions and set values accordingly
df.loc[(df["low"] < df["bb_lower_band"]) & (df["high_p1"] > df["bb_lower_band"]*1.002), "y_bb"] = True
df.loc[(df["high"] > df["bb_upper_band"]) & (df["low_p1"] < df["bb_upper_band"]/1.002), "y_bb"] = True

KeyError: 'high_p1'

In [14]:
df.y_bb.sum()

1349

# XGBoost

In [15]:
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'vol', 'vol_coin', 'day',
       'hour', '10MA', '10STD', 'vwap', 'bb_upper_band', 'bb_lower_band',
       'change', 'direction', 'obv', 'open_p1', 'high_p1', 'low_p1',
       'close_p1', 'open_p2', 'high_p2', 'low_p2', 'close_p2', 'open_p3',
       'high_p3', 'low_p3', 'close_p3', 'y_bb'],
      dtype='object')

In [16]:
from sklearn.model_selection import train_test_split

y = dep
X = shifted_df.dropna()

# do the train test splitting 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=None,shuffle=False) #do the split on variable y to avoid zero y entries

KeyError: "['daily_open', 'daily_close'] not in index"

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

xg = GradientBoostingClassifier() # min_samples_leaf=4
xg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_test = xg.predict(X_test)
accuracy_score(y_test, y_pred_test)

In [None]:
conf_m = confusion_matrix(y_test, y_pred_test)
conf_m

In [None]:
import seaborn as sns
sns.heatmap(conf_m/np.sum(conf_m,axis=0), annot=True, fmt='.2%', cmap='Blues')

In [None]:
print(classification_report(y_test, y_pred_test))

Overall, it seems like a useful model. However it is spurious that the precision is at exactly 75%. 

More ways to improve the model would be to 
- get more data
- add other symbols, like ETH, EUR/USD pair, S&P 500, etc.

In [None]:
pd.DataFrame(dict(cols=X_train.columns, imp=xg.feature_importances_)).sort_values(by=['imp']).plot('cols', 'imp', 'barh');

In [None]:
import pickle

# Save the model to a file
with open('../bb-bot/src/xgboost_model.pkl', 'wb') as file:
    pickle.dump(xg, file)


In [None]:

# Load the model from the file
with open('xgboost_model.pkl', 'rb') as file:
    model = pickle.load(file)


In [None]:
pred = model.predict(X_test.head(1))
pred

In [None]:
X_test.head(20)

# use live data

In [None]:
import requests
import time
import pandas as pd

# Define the endpoint and parameters
base_url = "https://api-testnet.bybit.com"
endpoint = "/v5/market/kline"
symbol = "BTCUSD"  # Example trading pair
interval = "1"    # Kline interval, e.g., "1", "5", "15", "60", "240", etc.
limit = 4        # Number of klines to fetch
start_time = int(time.time()) - 3600 * 24  # Start time in seconds (e.g., 24 hours ago)

# Construct the URL
url = f"{base_url}{endpoint}?category=linear&symbol={symbol}&interval={interval}&limit={limit}"#&start_time={start_time}"

# Make the GET request
response = requests.get(url,headers={}, data={})
data = response.json() 

print(data)
print(pd.Timestamp(data["time"],unit="ms"))
print(data["result"]["list"])

