# xgboost and Tabular data
## update tabular-trees notebook with xgboost and data preperation


See: 
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier

This is done with a 15m data set. 

Another approaches are: 
- 1 minute data set (2 std and 0.1% profit)
- more months
- do some descriptive data on MA length and when prices are out of BB range
- extending the dependent variable to more periods, so that the X% rise or fall occurs in the next e.g. 2-5 candles and not in the next candle 

In [1]:
# import data from septembre of 2022 since it was rather stable during this period
import pandas as pd
import numpy as np
import os
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if iskaggle:
    df = pd.read_csv("/kaggle/input/btcusdt-2023-6-9/btcusdt-2023-6_9.csv", index_col=0).reset_index(drop=True)
else:
    df = pd.read_csv("btc-data/btcusdt-2023-6_9.csv", index_col=0).reset_index(drop=True)


print(df.shape)
df.head(3)

(11716, 6)


Unnamed: 0,time,open,high,low,close,vol
0,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,386.675
1,2023.06.01 00:15,27096.9,27096.9,27036.7,27047.0,408.68
2,2023.06.01 00:30,27047.0,27077.4,27041.0,27054.9,275.08


In [2]:
386.675*((27108.1+27080.6)/2)

10476707.786249999

In [3]:
2.81566749*((71063+71053)/2) #, '200082', '2.81566749'

200075.70050442

In [4]:
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'vol'], dtype='object')

In [5]:
len(df.high)

11716

In [6]:
# clear column names of whitespaces etc. for renaming purposes
df.columns = df.columns.str.strip()

# rename vol to vol_coin bc of API call consistency data mapping 
df.rename(columns={"vol": "vol_coin"},inplace=True)


# idx = 0
# new_col = [7, 8, 9]  # can be a list, a Series, an array or a scalar   
# df.insert(loc=idx, column='A', value=new_col)

# add vol of coin in dollar currency
vol = df.vol_coin * ((df.high+df.low)/2)

df.insert(loc = 5, column = "vol",  value=vol)


In [7]:
df.head(1)

Unnamed: 0,time,open,high,low,close,vol,vol_coin
0,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,10476710.0,386.675


In [8]:
# transform to date_time
df["time"] = pd.to_datetime(df["time"])

# Extracting day and hour
df["day"] = df["time"].dt.day
df["hour"] = df["time"].dt.hour

# the 10-day moving average
df["10MA"] = df["close"].rolling(window=10).mean().shift(-9)

# Calculate the standard deviation of the closing prices over the same 20-day period
df["10STD"] = df["close"].rolling(window=10).std().shift(-9)

# volume weighted close price
df["vwap"] = df.close/df.vol

# Bollinger Band: 
# simple_moving_average(20) + std x 2 | sma - std x 2
df["bb_upper_band"] = df["10MA"] + 2 * df["10STD"]
df["bb_lower_band"] = df["10MA"] - 2 * df["10STD"]

# daily_open
# df["daily_open"] = df.groupby(df["day"])["open"].transform("first")

# daily_close
# df["daily_close"] = df.groupby(df["day"])["close"].transform("first")


# Calculate On-Balance Volume (OBV)
df["change"] = df["close"].diff()
df["direction"] = df["change"].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
df["obv"] = df["vol"] * df["direction"]
df["obv"] = df["obv"].cumsum()

# define variables that look 1 to three periods into the future
df["open_p1"] = df["open"].shift(1)
df["high_p1"] = df["high"].shift(1)
df["low_p1"] = df["low"].shift(1)
df["close_p1"] = df["close"].shift(1)

df["open_p2"] = df["open"].shift(2)
df["high_p2"] = df["high"].shift(2)
df["low_p2"] = df["low"].shift(2)
df["close_p2"] = df["close"].shift(2)

df["open_p3"] = df["open"].shift(3)
df["high_p3"] = df["high"].shift(3)
df["low_p3"] = df["low"].shift(3)
df["close_p3"] = df["close"].shift(3)


print(df.shape)
df[18:].head(3) # data set starts with the 20th observation since the first 20 rows had to be used for the moving average

(11716, 29)


Unnamed: 0,time,open,high,low,close,vol,vol_coin,day,hour,10MA,...,low_p1,close_p1,open_p2,high_p2,low_p2,close_p2,open_p3,high_p3,low_p3,close_p3
18,2023-06-01 04:30:00,27131.7,27152.2,27103.3,27103.4,17847270.0,657.897,1,4,26905.04,...,27089.4,27131.7,27060.2,27135.0,27050.0,27115.9,27103.6,27103.6,27012.0,27060.2
19,2023-06-01 04:45:00,27103.4,27144.9,27064.5,27069.3,26042600.0,960.815,1,4,26872.79,...,27103.3,27103.4,27115.9,27133.0,27089.4,27131.7,27060.2,27135.0,27050.0,27115.9
20,2023-06-01 05:00:00,27069.3,27078.8,27036.7,27044.4,12208350.0,451.196,1,5,26840.86,...,27064.5,27069.3,27131.7,27152.2,27103.3,27103.4,27115.9,27133.0,27089.4,27131.7


In [9]:
df.tail(10)

Unnamed: 0,time,open,high,low,close,vol,vol_coin,day,hour,10MA,...,low_p1,close_p1,open_p2,high_p2,low_p2,close_p2,open_p3,high_p3,low_p3,close_p3
11706,2023-09-30 21:45:00,26984.3,26993.8,26978.2,26993.7,4988821.0,184.867,30,21,27021.97,...,26984.0,26984.3,27008.9,27009.0,26990.0,26999.9,27008.6,27009.0,26992.0,27008.9
11707,2023-09-30 22:00:00,26993.7,26997.0,26974.3,26983.0,4819691.0,178.602,30,22,,...,26978.2,26993.7,26999.9,27004.7,26984.0,26984.3,27008.9,27009.0,26990.0,26999.9
11708,2023-09-30 22:15:00,26983.0,26990.3,26981.3,26984.5,3140068.0,116.36,30,22,,...,26974.3,26983.0,26984.3,26993.8,26978.2,26993.7,26999.9,27004.7,26984.0,26984.3
11709,2023-09-30 22:30:00,26984.5,27026.0,26984.5,27025.1,7208268.0,266.921,30,22,,...,26981.3,26984.5,26993.7,26997.0,26974.3,26983.0,26984.3,26993.8,26978.2,26993.7
11710,2023-09-30 22:45:00,27025.1,27025.2,27009.7,27011.5,5532147.0,204.762,30,22,,...,26984.5,27025.1,26983.0,26990.3,26981.3,26984.5,26993.7,26997.0,26974.3,26983.0
11711,2023-09-30 23:00:00,27011.5,27038.0,27011.5,27026.8,8725913.0,322.886,30,23,,...,27009.7,27011.5,26984.5,27026.0,26984.5,27025.1,26983.0,26990.3,26981.3,26984.5
11712,2023-09-30 23:15:00,27026.8,27044.3,27014.6,27016.8,5934343.0,219.551,30,23,,...,27011.5,27026.8,27025.1,27025.2,27009.7,27011.5,26984.5,27026.0,26984.5,27025.1
11713,2023-09-30 23:30:00,27016.8,27058.2,27016.8,27057.3,5358238.0,198.178,30,23,,...,27014.6,27016.8,27011.5,27038.0,27011.5,27026.8,27025.1,27025.2,27009.7,27011.5
11714,2023-09-30 23:45:00,27057.3,27061.2,27046.8,27054.4,4924558.0,182.027,30,23,,...,27016.8,27057.3,27026.8,27044.3,27014.6,27016.8,27011.5,27038.0,27011.5,27026.8
11715,2023-10-01 00:00:00,27054.4,27066.7,27054.3,27066.6,6138593.0,226.847,1,0,,...,27046.8,27054.4,27016.8,27058.2,27016.8,27057.3,27026.8,27044.3,27014.6,27016.8


In [10]:
df.head(5).close.mean()

27076.559999999998

In [11]:
df.head(10)

Unnamed: 0,time,open,high,low,close,vol,vol_coin,day,hour,10MA,...,low_p1,close_p1,open_p2,high_p2,low_p2,close_p2,open_p3,high_p3,low_p3,close_p3
0,2023-06-01 00:00:00,27103.1,27108.1,27080.6,27096.9,10476710.0,386.675,1,0,27117.44,...,,,,,,,,,,
1,2023-06-01 00:15:00,27096.9,27096.9,27036.7,27047.0,11061660.0,408.68,1,0,27132.45,...,27080.6,27096.9,,,,,,,,
2,2023-06-01 00:30:00,27047.0,27077.4,27041.0,27054.9,7443445.0,275.08,1,0,27147.65,...,27036.7,27047.0,27103.1,27108.1,27080.6,27096.9,,,,
3,2023-06-01 00:45:00,27054.9,27084.0,27054.8,27084.0,5905000.0,218.143,1,0,27172.57,...,27041.0,27054.9,27096.9,27096.9,27036.7,27047.0,27103.1,27108.1,27080.6,27096.9
4,2023-06-01 01:00:00,27084.0,27113.9,27073.5,27100.0,8924990.0,329.412,1,1,27191.57,...,27054.8,27084.0,27047.0,27077.4,27041.0,27054.9,27096.9,27096.9,27036.7,27047.0
5,2023-06-01 01:15:00,27100.0,27159.0,27100.0,27142.4,26577550.0,979.655,1,1,27191.93,...,27073.5,27100.0,27054.9,27084.0,27054.8,27084.0,27047.0,27077.4,27041.0,27054.9
6,2023-06-01 01:30:00,27142.4,27158.3,27120.0,27137.2,13917230.0,512.81,1,1,27183.71,...,27100.0,27142.4,27084.0,27113.9,27073.5,27100.0,27054.9,27084.0,27054.8,27084.0
7,2023-06-01 01:45:00,27137.2,27180.0,27133.9,27170.2,11620190.0,427.89,1,1,27181.58,...,27120.0,27137.2,27100.0,27159.0,27100.0,27142.4,27084.0,27113.9,27073.5,27100.0
8,2023-06-01 02:00:00,27170.2,27181.0,27147.2,27156.9,17759050.0,653.769,1,2,27177.73,...,27133.9,27170.2,27142.4,27158.3,27120.0,27137.2,27100.0,27159.0,27100.0,27142.4
9,2023-06-01 02:15:00,27156.9,27185.0,27145.5,27184.9,8651399.0,318.473,1,2,27172.38,...,27147.2,27156.9,27137.2,27180.0,27133.9,27170.2,27142.4,27158.3,27120.0,27137.2


In [12]:
df.head(10).close.mean()

27117.440000000002

## dependent variable

In [13]:

# GOAL:
# if the price is outside of the bollinger bands: 
#    is the next candle 0.5% away from the bb value?

# Create a new column "y_bb" initialized with False
df["y_bb"] = False

# Check conditions and set values accordingly
df.loc[(df["low"] < df["bb_lower_band"]) & (df["high_p1"] > df["bb_lower_band"]*1.002), "y_bb"] = True
df.loc[(df["high"] > df["bb_upper_band"]) & (df["low_p1"] < df["bb_upper_band"]/1.002), "y_bb"] = True

In [14]:
df.y_bb.sum()

1349

# XGBoost

In [15]:
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'vol', 'vol_coin', 'day',
       'hour', '10MA', '10STD', 'vwap', 'bb_upper_band', 'bb_lower_band',
       'change', 'direction', 'obv', 'open_p1', 'high_p1', 'low_p1',
       'close_p1', 'open_p2', 'high_p2', 'low_p2', 'close_p2', 'open_p3',
       'high_p3', 'low_p3', 'close_p3', 'y_bb'],
      dtype='object')

In [16]:
from sklearn.model_selection import train_test_split

cols = ['open', 'high', 'low', 'close', 'vol', 'day', 'hour', '10MA', 
       '10STD', 'vwap', 'bb_upper_band', 'bb_lower_band', 'daily_open', 
       'daily_close', 'change', 'direction', 'obv']

df = df.dropna()
X,y = df[cols],df["y_bb"]

# do the train test splitting 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=None,shuffle=False) #do the split on variable y to avoid zero y entries

KeyError: "['daily_open', 'daily_close'] not in index"

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

xg = GradientBoostingClassifier() # min_samples_leaf=4
xg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred_test = xg.predict(X_test)
accuracy_score(y_test, y_pred_test)

In [None]:
conf_m = confusion_matrix(y_test, y_pred_test)
conf_m

In [None]:
import seaborn as sns
sns.heatmap(conf_m/np.sum(conf_m,axis=0), annot=True, fmt='.2%', cmap='Blues')

In [None]:
print(classification_report(y_test, y_pred_test))

Overall, it seems like a useful model. However it is spurious that the precision is at exactly 75%. 

More ways to improve the model would be to 
- get more data
- add other symbols, like ETH, EUR/USD pair, S&P 500, etc.

In [None]:
pd.DataFrame(dict(cols=X_train.columns, imp=xg.feature_importances_)).sort_values(by=['imp']).plot('cols', 'imp', 'barh');

In [None]:
import pickle

# Save the model to a file
with open('../bb-bot/src/xgboost_model.pkl', 'wb') as file:
    pickle.dump(xg, file)


In [None]:

# Load the model from the file
with open('xgboost_model.pkl', 'rb') as file:
    model = pickle.load(file)


In [None]:
pred = model.predict(X_test.head(1))
pred

In [None]:
X_test.head(20)

# use live data

In [None]:
import requests
import time
import pandas as pd

# Define the endpoint and parameters
base_url = "https://api-testnet.bybit.com"
endpoint = "/v5/market/kline"
symbol = "BTCUSD"  # Example trading pair
interval = "1"    # Kline interval, e.g., "1", "5", "15", "60", "240", etc.
limit = 4        # Number of klines to fetch
start_time = int(time.time()) - 3600 * 24  # Start time in seconds (e.g., 24 hours ago)

# Construct the URL
url = f"{base_url}{endpoint}?category=linear&symbol={symbol}&interval={interval}&limit={limit}"#&start_time={start_time}"

# Make the GET request
response = requests.get(url,headers={}, data={})
data = response.json() 

print(data)
print(pd.Timestamp(data["time"],unit="ms"))
print(data["result"]["list"])

