## Create data

In [1]:
import pandas as pd

import os
# Non-empty when the KAGGLE_KERNEL_RUN_TYPE environment variable is set
# (presumably only inside a Kaggle kernel); empty string otherwise.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

# Read the candle CSV from the location that matches the environment.
# The first CSV column is parsed as the index and then discarded by
# reset_index(drop=True), leaving a fresh 0..n-1 index.
df = pd.read_csv(
    "/kaggle/input/btcusdt-2023-6-9/btcusdt-2023-6_9.csv"
    if iskaggle
    else "../lesson5-random-forests/btc-data/btcusdt-2023-6_9.csv",
    index_col=0,
).reset_index(drop=True)

print(df.shape)
df.head(3)

(11716, 6)


Unnamed: 0,time,open,high,low,close,vol
0,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,386.675
1,2023.06.01 00:15,27096.9,27096.9,27036.7,27047.0,408.68
2,2023.06.01 00:30,27047.0,27077.4,27041.0,27054.9,275.08


In [2]:
# Show the column labels of the loaded frame.
df.columns

Index(['time', 'open', 'high', 'low', 'close', 'vol'], dtype='object')

#### Add log transforms and min-max normalization

In [3]:
# Scratch check: `+` on two lists returns a new, concatenated list.
["a","b"]+["c"]

['a', 'b', 'c']

In [4]:
import numpy as np

cols = ['open', 'high', 'low', 'close', 'vol']

# Natural log of every raw candle column, stored alongside it as "log_<name>".
for col in cols:
    df["log_" + col] = np.log(df[col])

# Min-max scaling, (x - min) / (max - min), which is the formula scikit-learn's
# MinMaxScaler applies with its default feature range. It is applied to each
# raw column plus the log-volume column and stored as "norm_<name>".
# NOTE(review): min and max are taken over the whole frame, not a training split.
for col in cols + ["log_vol"]:
    values = df[col]
    col_min = np.min(values)
    col_max = np.max(values)
    df["norm_" + col] = (values - col_min) / (col_max - col_min)

print(df.shape)
df.head(3)

(11716, 17)


Unnamed: 0,time,open,high,low,close,vol,log_open,log_high,log_low,log_close,log_vol,norm_open,norm_high,norm_low,norm_close,norm_vol,norm_log_vol
0,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,386.675,10.207403,10.207588,10.206573,10.207175,5.957585,0.384586,0.316752,0.386895,0.383753,0.005618,0.348938
1,2023.06.01 00:15,27096.9,27096.9,27036.7,27047.0,408.68,10.207175,10.207175,10.20495,10.205331,6.012932,0.383753,0.315136,0.380997,0.377046,0.00596,0.355982
2,2023.06.01 00:30,27047.0,27077.4,27041.0,27054.9,275.08,10.205331,10.206455,10.20511,10.205623,5.617062,0.377046,0.312322,0.381575,0.378108,0.003885,0.305599


In [7]:
# Build three lagged copies of the frame so that one row can carry the
# previous three candles next to the current one.
#   - shift(k) moves every column down by k rows; the first k rows become NaN.
#   - add_suffix("_s<k>") keeps the lagged column names distinct from the
#     original ones.
# The unshifted frame `df` therefore holds the target candle for each row.
df_s1 = df.shift(1).add_suffix("_s1")
df_s2 = df.shift(2).add_suffix("_s2")
df_s3 = df.shift(3).add_suffix("_s3")

# The first rows of the 3-step lag are all NaN because of the shift.
print(df_s3.shape)
df_s3.head(2)

(11716, 17)


Unnamed: 0,time_s3,open_s3,high_s3,low_s3,close_s3,vol_s3,log_open_s3,log_high_s3,log_low_s3,log_close_s3,log_vol_s3,norm_open_s3,norm_high_s3,norm_low_s3,norm_close_s3,norm_vol_s3,norm_log_vol_s3
0,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,


In [8]:
# Place the three lagged frames and the original frame side by side,
# oldest lag first, so each row holds candles t-3, t-2, t-1 and t.
df_merge = pd.concat(
    [df_s3, df_s2, df_s1, df],
    axis="columns",
)
print(df_merge.shape)
df_merge.head(5)

(11716, 68)


Unnamed: 0,time_s3,open_s3,high_s3,low_s3,close_s3,vol_s3,log_open_s3,log_high_s3,log_low_s3,log_close_s3,...,log_high,log_low,log_close,log_vol,norm_open,norm_high,norm_low,norm_close,norm_vol,norm_log_vol
0,,,,,,,,,,,...,10.207588,10.206573,10.207175,5.957585,0.384586,0.316752,0.386895,0.383753,0.005618,0.348938
1,,,,,,,,,,,...,10.207175,10.20495,10.205331,6.012932,0.383753,0.315136,0.380997,0.377046,0.00596,0.355982
2,,,,,,,,,,,...,10.206455,10.20511,10.205623,5.617062,0.377046,0.312322,0.381575,0.378108,0.003885,0.305599
3,2023.06.01 00:00,27103.1,27108.1,27080.6,27096.9,386.675,10.207403,10.207588,10.206573,10.207175,...,10.206698,10.20562,10.206698,5.385151,0.378108,0.313275,0.383429,0.382019,0.003001,0.276083
4,2023.06.01 00:15,27096.9,27096.9,27036.7,27047.0,408.68,10.207175,10.207175,10.20495,10.205331,...,10.207802,10.206311,10.207289,5.797309,0.382019,0.317588,0.385941,0.38417,0.004729,0.328539


In [9]:
# List the column labels of the merged frame.
df_merge.columns

Index(['time_s3', 'open_s3', 'high_s3', 'low_s3', 'close_s3', 'vol_s3',
       'log_open_s3', 'log_high_s3', 'log_low_s3', 'log_close_s3',
       'log_vol_s3', 'norm_open_s3', 'norm_high_s3', 'norm_low_s3',
       'norm_close_s3', 'norm_vol_s3', 'norm_log_vol_s3', 'time_s2', 'open_s2',
       'high_s2', 'low_s2', 'close_s2', 'vol_s2', 'log_open_s2', 'log_high_s2',
       'log_low_s2', 'log_close_s2', 'log_vol_s2', 'norm_open_s2',
       'norm_high_s2', 'norm_low_s2', 'norm_close_s2', 'norm_vol_s2',
       'norm_log_vol_s2', 'time_s1', 'open_s1', 'high_s1', 'low_s1',
       'close_s1', 'vol_s1', 'log_open_s1', 'log_high_s1', 'log_low_s1',
       'log_close_s1', 'log_vol_s1', 'norm_open_s1', 'norm_high_s1',
       'norm_low_s1', 'norm_close_s1', 'norm_vol_s1', 'norm_log_vol_s1',
       'time', 'open', 'high', 'low', 'close', 'vol', 'log_open', 'log_high',
       'log_low', 'log_close', 'log_vol', 'norm_open', 'norm_high', 'norm_low',
       'norm_close', 'norm_vol', 'norm_log_vol'],
    

In [10]:
# Write only the complete rows to disk: dropna() removes every row that
# contains a NaN (the leading rows have no lagged values after the shifts).
# The index is written as the first CSV column (to_csv default).
(
    df_merge
    .dropna()
    .to_csv("nnbasic-btc-data.csv")
)

In [11]:
# Keep only the raw open/high/low/close/vol values for lags 3, 2, 1 and the
# current candle; the time, log_* and norm_* columns are left out.
# Column order: all five values of the oldest lag first, current candle last.
# Rows with a NaN (no complete 3-candle history) are dropped afterwards.
df_train = df_merge.filter(
    items=[
        field + lag
        for lag in ("_s3", "_s2", "_s1", "")
        for field in ("open", "high", "low", "close", "vol")
    ]
).dropna()
print(df_train.shape)
df_train.head(3)

(11713, 20)


Unnamed: 0,open_s3,high_s3,low_s3,close_s3,vol_s3,open_s2,high_s2,low_s2,close_s2,vol_s2,open_s1,high_s1,low_s1,close_s1,vol_s1,open,high,low,close,vol
3,27103.1,27108.1,27080.6,27096.9,386.675,27096.9,27096.9,27036.7,27047.0,408.68,27047.0,27077.4,27041.0,27054.9,275.08,27054.9,27084.0,27054.8,27084.0,218.143
4,27096.9,27096.9,27036.7,27047.0,408.68,27047.0,27077.4,27041.0,27054.9,275.08,27054.9,27084.0,27054.8,27084.0,218.143,27084.0,27113.9,27073.5,27100.0,329.412
5,27047.0,27077.4,27041.0,27054.9,275.08,27054.9,27084.0,27054.8,27084.0,218.143,27084.0,27113.9,27073.5,27100.0,329.412,27100.0,27159.0,27100.0,27142.4,979.655
