# Data preperation notebook

In [1]:
import requests
import time
import pandas as pd

# Define the endpoint and parameters
base_url = "https://api-testnet.bybit.com"
endpoint = "/v5/market/kline"
symbol = "BTCUSD"  # Example trading pair
interval = "15"    # Kline interval, e.g., "1", "5", "15", "60", "240", etc.
limit = 10        # Number of klines to fetch
start_time = int(time.time()) - 3600 * 24  # Start time in seconds (e.g., 24 hours ago)

# Construct the URL
url = f"{base_url}{endpoint}?category=linear&symbol={symbol}&interval={interval}&limit={limit}"#&start_time={start_time}"

# Make the GET request
response = requests.get(url,headers={}, data={})
data = response.json() 

print(data)
print(pd.Timestamp(data["time"],unit="ms"))
print(data["result"]["list"])



{'retCode': 0, 'retMsg': 'OK', 'result': {'symbol': 'BTCUSD', 'category': 'linear', 'list': [['1716822000000', '69497', '69736.5', '69497', '69712.5', '6179535', '88.71829642'], ['1716821100000', '69129', '69497', '69083.5', '69497', '15505718', '223.67244134'], ['1716820200000', '69060', '69294.5', '69055.5', '69129', '13321903', '192.63721571'], ['1716819300000', '68995', '69175.5', '68952', '69060', '13541302', '196.06036964'], ['1716818400000', '68934.5', '69004.5', '68859.5', '68995', '12773788', '185.28870847'], ['1716817500000', '68905', '69043', '68895.5', '68934.5', '11868576', '172.10881686'], ['1716816600000', '68848', '68914', '68719.5', '68905', '12674260', '184.13571694'], ['1716815700000', '68776', '69033', '68776', '68848', '14587827', '211.70810929'], ['1716814800000', '68624', '68796', '68592', '68776', '9857688', '143.47052991'], ['1716813900000', '68589', '68633.5', '68565.5', '68624', '7062041', '102.96127408']]}, 'retExtInfo': {}, 'time': 1716822175186}
2024-05-27

In [2]:
#def prepare_data(data):
print(pd.to_datetime(int(data["result"]["list"][0][0]),unit='ms'))
print(pd.to_datetime(int(data["result"]["list"][1][0]),unit='ms'))
print(pd.to_datetime(int(data["result"]["list"][2][0]),unit='ms'))
print(pd.to_datetime(int(data["result"]["list"][3][0]),unit='ms'))



2024-05-27 15:00:00
2024-05-27 14:45:00
2024-05-27 14:30:00
2024-05-27 14:15:00


In [42]:
import pandas as pd
import numpy as np

def prepare_data(data):
    
    # define pandas DF 
    cols = ['time','open', 'high', 'low', 'close','vol','vol_coin']
    df = pd.DataFrame(data["result"]["list"], columns=cols)
    
    df['time'] = df['time'].astype(int)
    
    df["time"] = pd.to_datetime(df["time"],unit="ms")
    df["day"] = df["time"].dt.day
    df["hour"] = df["time"].dt.hour

    # the 10-day moving average
    df["10MA"] = df["close"].astype('float64').mean()
    
    # Calculate the standard deviation of the closing prices over the same 20-day period
    df["10STD"] = df["close"].astype('float64').std()
    
    # volume weighted close price
    df["vwap"] = df.close.astype('float64')/df.vol.astype('float64')
    
    # Bollinger Band: 
    # simple_moving_average(20) + std x 2 | sma - std x 2
    df["bb_upper_band"] = df["10MA"].astype('float64') + 2 * df["10STD"].astype('float64')
    df["bb_lower_band"] = df["10MA"].astype('float64') - 2 * df["10STD"].astype('float64')

    # Calculate On-Balance Volume (OBV)
    df["change"] = df["close"].astype('float64').diff(periods=-1)#.shift(-1)
    df["direction"] = df["change"].astype('float64').apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    df["obv"] = df["vol"].astype('float64') * df["direction"].astype('float64')
    df["obv"] = df["obv"].astype('float64').cumsum()

    # Stack the DataFrame for all non-constant rows of the period (OHLC,vol,vwap,obv)
    stacked_df_cols = ["open","high","low","close","vol","vol_coin","vwap","obv","change","direction"]
    stacked_df = df.loc[0:len(df)-2,stacked_df_cols].stack() #cut the last row 0:8=9, (10-2) bc of diff = NAN
    
    # Create a new DataFrame from the stacked series and transpose it
    # this creates a multi index data frame with tuples as indices, like [(0,'ts'),...]
    df_single_row = stacked_df.to_frame().T
    
    # now change the multiindex col to a single index col by replacing it witht a list of concatenated strings 
    df_single_row.columns = [f'{col[1]}_{col[0]}' for col in df_single_row.columns]

    # get the first row of the bollinger band metrics that are a constant for the 10 observations
    bb_cols = ["10MA","10STD","bb_upper_band","bb_lower_band"]
    df_bb_metrics = df[bb_cols].head(1)

    # collect meta data
    meta_cols = ["time","day","hour"]
    df_meta = df[meta_cols].head(1)

    # put cols side by side
    df_concat = pd.concat([df_meta,df_bb_metrics,df_single_row],axis=1)
    
    return df,stacked_df,df_single_row,df_concat
    
df,stack_df,single_df,prep_df = prepare_data(data)

In [27]:
df

Unnamed: 0,time,open,high,low,close,vol,vol_coin,day,hour,10MA,10STD,vwap,bb_upper_band,bb_lower_band,change,direction,obv
0,2024-05-27 15:00:00,69497.0,69736.5,69497.0,69712.5,6179535,88.71829642,27,15,69048.1,329.943834,0.011281,69707.987667,68388.212333,-215.5,-1,-6179535.0
1,2024-05-27 14:45:00,69129.0,69497.0,69083.5,69497.0,15505718,223.67244134,27,14,69048.1,329.943834,0.004482,69707.987667,68388.212333,-368.0,-1,-21685253.0
2,2024-05-27 14:30:00,69060.0,69294.5,69055.5,69129.0,13321903,192.63721571,27,14,69048.1,329.943834,0.005189,69707.987667,68388.212333,-69.0,-1,-35007156.0
3,2024-05-27 14:15:00,68995.0,69175.5,68952.0,69060.0,13541302,196.06036964,27,14,69048.1,329.943834,0.0051,69707.987667,68388.212333,-65.0,-1,-48548458.0
4,2024-05-27 14:00:00,68934.5,69004.5,68859.5,68995.0,12773788,185.28870847,27,14,69048.1,329.943834,0.005401,69707.987667,68388.212333,-60.5,-1,-61322246.0
5,2024-05-27 13:45:00,68905.0,69043.0,68895.5,68934.5,11868576,172.10881686,27,13,69048.1,329.943834,0.005808,69707.987667,68388.212333,-29.5,-1,-73190822.0
6,2024-05-27 13:30:00,68848.0,68914.0,68719.5,68905.0,12674260,184.13571694,27,13,69048.1,329.943834,0.005437,69707.987667,68388.212333,-57.0,-1,-85865082.0
7,2024-05-27 13:15:00,68776.0,69033.0,68776.0,68848.0,14587827,211.70810929,27,13,69048.1,329.943834,0.00472,69707.987667,68388.212333,-72.0,-1,-100452909.0
8,2024-05-27 13:00:00,68624.0,68796.0,68592.0,68776.0,9857688,143.47052991,27,13,69048.1,329.943834,0.006977,69707.987667,68388.212333,-152.0,-1,-110310597.0
9,2024-05-27 12:45:00,68589.0,68633.5,68565.5,68624.0,7062041,102.96127408,27,12,69048.1,329.943834,0.009717,69707.987667,68388.212333,,0,-110310597.0


In [44]:
stack_df[8]

open                68624
high                68796
low                 68592
close               68776
vol               9857688
vol_coin     143.47052991
vwap             0.006977
obv           110310597.0
change              152.0
direction               1
dtype: object

In [43]:
stacked_df_cols = ["open","high","low","close","vol","vol_coin","vwap","obv","change","direction"]
print(len(stacked_df_cols))
print(len(df)-1)
df.loc[0:len(df)-2,stacked_df_cols]

10
9


Unnamed: 0,open,high,low,close,vol,vol_coin,vwap,obv,change,direction
0,69497.0,69736.5,69497.0,69712.5,6179535,88.71829642,0.011281,6179535.0,215.5,1
1,69129.0,69497.0,69083.5,69497.0,15505718,223.67244134,0.004482,21685253.0,368.0,1
2,69060.0,69294.5,69055.5,69129.0,13321903,192.63721571,0.005189,35007156.0,69.0,1
3,68995.0,69175.5,68952.0,69060.0,13541302,196.06036964,0.0051,48548458.0,65.0,1
4,68934.5,69004.5,68859.5,68995.0,12773788,185.28870847,0.005401,61322246.0,60.5,1
5,68905.0,69043.0,68895.5,68934.5,11868576,172.10881686,0.005808,73190822.0,29.5,1
6,68848.0,68914.0,68719.5,68905.0,12674260,184.13571694,0.005437,85865082.0,57.0,1
7,68776.0,69033.0,68776.0,68848.0,14587827,211.70810929,0.00472,100452909.0,72.0,1
8,68624.0,68796.0,68592.0,68776.0,9857688,143.47052991,0.006977,110310597.0,152.0,1


In [46]:
len(prep_df.columns)

97

In [7]:
prep_df.change_8

0   -152.0
Name: change_8, dtype: object

In [36]:
prep_df.head()

Unnamed: 0,time,day,hour,10MA,10STD,bb_upper_band,bb_lower_band,open_0,high_0,low_0,...,direction_7,open_8,high_8,low_8,close_8,vol_8,vol_coin_8,vwap_8,obv_8,direction_8
0,2024-05-27 15:00:00,27,15,69048.1,329.943834,69707.987667,68388.212333,69497,69736.5,69497,...,1,68624,68796,68592,68776,9857688,143.47052991,0.006977,100452909.0,0


In [59]:
stacked_df_cols = ["open","high","low","close","vwap","obv","change","direction"]
prep_df[stacked_df_cols]

Unnamed: 0,open,high,low,close,vwap,obv,change,direction
0,68848.0,68914.0,68719.5,68890.0,0.006231,-11055257.0,-42.0,-1
1,68776.0,69033.0,68776.0,68848.0,0.00472,-25643084.0,-72.0,-1
2,68624.0,68796.0,68592.0,68776.0,0.006977,-35500772.0,-152.0,-1
3,68589.0,68633.5,68565.5,68624.0,0.009717,-42562813.0,-35.0,-1
4,68471.5,68608.0,68471.5,68589.0,0.008056,-51077245.0,-117.5,-1
5,68383.0,68477.5,68383.0,68471.5,0.010979,-57313938.0,-88.5,-1
6,68436.0,68459.0,68364.0,68383.0,0.011204,-51210493.0,53.0,1
7,68348.5,68444.5,68314.5,68436.0,0.009806,-58189798.0,-87.5,-1
8,68486.0,68506.5,68249.0,68348.5,0.006046,-46884617.0,137.5,1
9,68607.0,68608.5,68486.0,68486.0,0.020397,-46884617.0,,0


In [68]:
bb_cols = ["10MA","10STD","bb_upper_band","bb_lower_band"]
prep_df[bb_cols].head(1)

pd.concat([prep_df[stacked_df_cols].head(1),prep_df[bb_cols].head(1)],axis=1)

Unnamed: 0,open,high,low,close,vwap,obv,change,direction,10MA,10STD,bb_upper_band,bb_lower_band
0,68848,68914,68719.5,68890,0.006231,-11055257.0,-42.0,-1,68585.2,194.888031,68974.976061,68195.423939


In [None]:
    # applying logs
    # for col in cols:
    #    df["log_"+str(col)] = np.log(df[str(col)].astype(''float64'64')+1)

    # >>> AS FOR NOW useless
    # normalize along "MinMaxScaler" (same like sci kit learn)
    # for col in cols+["log_vol"]:
    #    df["norm_"+str(col)] =  (df[str(col)].astype(''float64'64') - np.min(df[str(col)].astype(''float64'64')) ) / ( np.max(df[str(col)].astype(''float64'64')) - np.min(df[str(col)].astype(''float64'64')) ) 

    # Stack the DataFrame for all non-constant rows of the period (OHLC,vol,vwap,obv)
    stacked_df_cols = ["open","high","low","close","vwap","obv","change","direction"]
    stacked_df = df[stacked_df_cols].stack()
    
    # Create a new DataFrame from the stacked series and transpose it
    # this creates a multi index data frame with tuples as indices, like [(0,'ts'),...]
    df_single_row = stacked_df.to_frame().T
    
    # now change the multiindex col to a single index col by replacing it witht a list of concatenated strings 
    df_single_row.columns = [f'{col[1]}_{col[0]}' for col in df_single_row.columns]

    # get the first row of the bollinger band metrics that are a constant for the 10 observations
    bb_cols = ["10MA","10STD","bb_upper_band","bb_lower_band"]
    df_bb_metrics = df[bb_cols].head(1)

    # put cols side by side
    df_concat = pd.concat([df_bb_metrics,df_single_row],axis=1)

    
    
    return df_concat



prep_df = prepare_data(data)
prep_df

# Creating a database

In [4]:
from sqlalchemy import create_engine
#engine = create_engine("sqlite+pysqlite:///:memory:", echo=True)

engine = create_engine('sqlite:///BTCUSDTstream-test.db')

In [5]:
# write function to transform data from bybit call    

In [6]:
prep_df.to_sql('BTCUSDT',engine, if_exists = 'append', index=False)

1

In [7]:
pd.read_sql('BTCUSDT',engine)

Unnamed: 0,ts_0,open_0,high_0,low_0,close_0,vol_0,vol_coin_0,log_ts_0,log_open_0,log_high_0,...,close_3,vol_3,vol_coin_3,log_ts_3,log_open_3,log_high_3,log_low_3,log_close_3,log_vol_3,log_vol_coin_3
0,1716284040000,71056.5,71063.0,71053.0,71056.5,200082,2.81566749,28.171183,11.171245,11.171336,...,71037,13,0.00018291,28.171183,11.170815,11.17097,11.170815,11.17097,2.639057,0.000183
1,1716562500000,67863.5,67884.5,67863.5,67884.5,685944,10.10657531,28.171345,11.125268,11.125578,...,67778,115,0.0016968,28.171345,11.123698,11.124008,11.123698,11.124008,4.75359,0.001695


In essence the function calls would look like this: 

response = fetch_kline()
pred_df = prepare_data(response)

entry,stop_loss,take_profit = make_prediction(prep_df) # add column if trade or not 

write_db(prep_df)

place_order(entry,stop_loss,take_profit)

