In [1]:
import pandas as pd

from utils.common import day_aware_shift, plot_confusion_matrix
from utils.features import create_lag, create_rsi, create_dst, create_ma_ratio, create_z, \
    create_bollinger_band

import pickle
import datetime

In [2]:
# Load the raw bars from the local pickle. The file handle is closed
# deterministically by the `with` block instead of being left to the GC.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load pickles that come from a trusted source.
with open('./large_files/cl-1m-2.pkl', 'rb') as pkl_file:
    df_wti_1m = pickle.load(pkl_file)

# Keep rows whose local time-of-day is at or after 07:00 and whose hour is
# at most 16. The upper bound compares the hour component only, so bars up to
# and including 16:59 are retained.
# NOTE(review): if a hard 16:00 cutoff was intended, this keeps an extra
# 59 minutes per day -- confirm.
in_session = (
    (df_wti_1m['datetime'].dt.time >= datetime.time(hour=7, minute=0))
    & (df_wti_1m['datetime'].dt.hour <= 16)
)
df_wti_1m = df_wti_1m[in_session]

# Work on an independent copy so that adding feature columns does not touch
# (or warn about) the filtered source frame.
df = df_wti_1m.copy()
df['month'] = df['datetime'].dt.month
df.head(3)

Unnamed: 0,date,time,open,high,low,close,volume,datetime,day,month
525,2007-04-02,700,65.74,65.77,65.74,65.77,4,2007-04-02 07:00:00-04:00,Monday,4
526,2007-04-02,701,65.75,65.75,65.7,65.7,20,2007-04-02 07:01:00-04:00,Monday,4
527,2007-04-02,702,65.72,65.72,65.65,65.66,40,2007-04-02 07:02:00-04:00,Monday,4


In [3]:
# Add Bollinger-band features using a period of 30 (helper lives in
# utils.features; the info() output below shows e.g. a `bb_high` column that
# contains NaNs).
df = create_bollinger_band(df, period=30)

# Dollar volume: row-wise product of the close price and the traded volume.
df['dollar_vol'] = df[['close', 'volume']].prod(axis=1)

# `verbose` is documented as a bool; pass True rather than a truthy int.
df.info(verbose=True, show_counts=True)

bollinger_band_30: 100%|██████████| 6142/6142 [00:04<00:00, 1426.88it/s]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579355 entries, 525 to 92654
Data columns (total 13 columns):
 #   Column      Non-Null Count    Dtype                           
---  ------      --------------    -----                           
 0   date        2579355 non-null  object                          
 1   time        2579355 non-null  object                          
 2   open        2579355 non-null  float64                         
 3   high        2579355 non-null  float64                         
 4   low         2579355 non-null  float64                         
 5   close       2579355 non-null  float64                         
 6   volume      2579355 non-null  int64                           
 7   datetime    2579355 non-null  datetime64[ns, America/New_York]
 8   day         2579355 non-null  object                          
 9   month       2579355 non-null  int64                           
 10  bb_high     2453495 non-null  float64                         
 11

In [4]:
# For every period, call create_z on each of the three base columns.
# The call order (close, volume, dollar_vol within each period) is the same as
# when the three calls were written out by hand.
# NOTE(review): later cells reference columns named `z_close_{period}m` and
# `z_volume_{period}m`, presumably produced here -- confirm in utils.features.
for period in [5, 15, 30, 60]:
    for column in ('close', 'volume', 'dollar_vol'):
        df = create_z(df, period, column)

# `verbose` is documented as a bool; pass True rather than a truthy int.
df.info(verbose=True, show_counts=True)

z_5 close: 100%|██████████| 6142/6142 [00:03<00:00, 1639.65it/s]
z_5 volume: 100%|██████████| 6142/6142 [00:03<00:00, 1633.42it/s]
z_5 dollar_vol: 100%|██████████| 6142/6142 [00:03<00:00, 1616.33it/s]
z_15 close: 100%|██████████| 6142/6142 [00:03<00:00, 1601.92it/s]
z_15 volume: 100%|██████████| 6142/6142 [00:03<00:00, 1547.81it/s]
z_15 dollar_vol: 100%|██████████| 6142/6142 [00:03<00:00, 1574.40it/s]
z_30 close: 100%|██████████| 6142/6142 [00:03<00:00, 1595.38it/s]
z_30 volume: 100%|██████████| 6142/6142 [00:03<00:00, 1551.78it/s]
z_30 dollar_vol: 100%|██████████| 6142/6142 [00:03<00:00, 1571.69it/s]
z_60 close: 100%|██████████| 6142/6142 [00:04<00:00, 1520.79it/s]
z_60 volume: 100%|██████████| 6142/6142 [00:03<00:00, 1572.33it/s]
z_60 dollar_vol: 100%|██████████| 6142/6142 [00:03<00:00, 1555.43it/s]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579355 entries, 525 to 92654
Data columns (total 61 columns):
 #   Column              Non-Null Count    Dtype                           
---  ------              --------------    -----                           
 0   date                2579355 non-null  object                          
 1   time                2579355 non-null  object                          
 2   open                2579355 non-null  float64                         
 3   high                2579355 non-null  float64                         
 4   low                 2579355 non-null  float64                         
 5   close               2579355 non-null  float64                         
 6   volume              2579355 non-null  int64                           
 7   datetime            2579355 non-null  datetime64[ns, America/New_York]
 8   day                 2579355 non-null  object                          
 9   month               2579355 non-null  int64   

In [5]:
# Add the `dst` and `rsi` feature families for the periods 5, 15, 30 and 60.
# Both helpers come from utils.features and take the full list of periods in
# a single call.
# NOTE(review): a later cell reads `dst_high_{period}m` / `dst_low_{period}m`,
# presumably created by create_dst -- confirm.
feature_periods = [5, 15, 30, 60]
df = create_dst(df, feature_periods)
df = create_rsi(df, feature_periods)

# `verbose` is documented as a bool; pass True rather than a truthy int.
df.info(verbose=True, show_counts=True)

dst: 100%|██████████| 6142/6142 [00:22<00:00, 277.21it/s]
rsi: 100%|██████████| 6142/6142 [00:27<00:00, 224.98it/s]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579355 entries, 525 to 92654
Data columns (total 89 columns):
 #   Column              Non-Null Count    Dtype                           
---  ------              --------------    -----                           
 0   date                2579355 non-null  object                          
 1   time                2579355 non-null  object                          
 2   open                2579355 non-null  float64                         
 3   high                2579355 non-null  float64                         
 4   low                 2579355 non-null  float64                         
 5   close               2579355 non-null  float64                         
 6   volume              2579355 non-null  int64                           
 7   datetime            2579355 non-null  datetime64[ns, America/New_York]
 8   day                 2579355 non-null  object                          
 9   month               2579355 non-null  int64   

In [7]:
# Create lagged versions of four feature families. For every feature period
# and every lag length (`sub_period`), create_lag is called on the columns
# `<prefix>_<period>m`, in the order listed in `lagged_prefixes`.
# This yields 4 periods x 4 lags x 4 prefixes = 64 create_lag calls, issued in
# the same order as the original hand-written calls.
windows = [5, 15, 30, 60]
lagged_prefixes = ('z_close', 'z_volume', 'dst_high', 'dst_low')
for period in windows:
    for sub_period in windows:
        for prefix in lagged_prefixes:
            df = create_lag(df, f'{prefix}_{period}m', sub_period)

# `verbose` is documented as a bool; pass True rather than a truthy int.
df.info(verbose=True, show_counts=True)

lag5m_z_close_5m: 100%|██████████| 6142/6142 [00:02<00:00, 2759.59it/s]
lag5m_z_volume_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4052.20it/s]
lag5m_dst_high_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4089.57it/s]
lag5m_dst_low_5m: 100%|██████████| 6142/6142 [00:01<00:00, 5147.94it/s]
lag15m_z_close_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4033.40it/s]
lag15m_z_volume_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4824.63it/s]
lag15m_dst_high_5m: 100%|██████████| 6142/6142 [00:01<00:00, 3449.53it/s]
lag15m_dst_low_5m: 100%|██████████| 6142/6142 [00:01<00:00, 3731.30it/s]
lag30m_z_close_5m: 100%|██████████| 6142/6142 [00:01<00:00, 3806.39it/s]
lag30m_z_volume_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4005.84it/s]
lag30m_dst_high_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4113.90it/s]
lag30m_dst_low_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4761.89it/s]
lag60m_z_close_5m: 100%|██████████| 6142/6142 [00:01<00:00, 4899.83it/s]
lag60m_z_volume_5m: 100%|██████████| 6142/6142 [0

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579355 entries, 525 to 92654
Data columns (total 153 columns):
 #    Column               Non-Null Count    Dtype                           
---   ------               --------------    -----                           
 0    date                 2579355 non-null  object                          
 1    time                 2579355 non-null  object                          
 2    open                 2579355 non-null  float64                         
 3    high                 2579355 non-null  float64                         
 4    low                  2579355 non-null  float64                         
 5    close                2579355 non-null  float64                         
 6    volume               2579355 non-null  int64                           
 7    datetime             2579355 non-null  datetime64[ns, America/New_York]
 8    day                  2579355 non-null  object                          
 9    month                2

In [9]:
# Add one moving-average-ratio feature per (period, sub_period) pair where
# sub_period is strictly longer than period. With four windows that is six
# pairs: (5,15), (5,30), (5,60), (15,30), (15,60), (30,60).
windows = [5, 15, 30, 60]
for period in windows:
    for sub_period in windows:
        # Skip equal and shorter second windows.
        if sub_period <= period:
            continue
        df = create_ma_ratio(df, period, sub_period)

# `verbose` is documented as a bool; pass True rather than a truthy int.
df.info(verbose=True, show_counts=True)

ma_ratio_5_15: 100%|██████████| 6142/6142 [00:01<00:00, 3073.84it/s]
ma_ratio_5_30: 100%|██████████| 6142/6142 [00:01<00:00, 3097.84it/s]
ma_ratio_5_60: 100%|██████████| 6142/6142 [00:02<00:00, 2732.87it/s]
ma_ratio_15_30: 100%|██████████| 6142/6142 [00:01<00:00, 3197.89it/s]
ma_ratio_15_60: 100%|██████████| 6142/6142 [00:02<00:00, 2737.38it/s]
ma_ratio_30_60: 100%|██████████| 6142/6142 [00:01<00:00, 3162.58it/s]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579355 entries, 525 to 92654
Data columns (total 159 columns):
 #    Column               Non-Null Count    Dtype                           
---   ------               --------------    -----                           
 0    date                 2579355 non-null  object                          
 1    time                 2579355 non-null  object                          
 2    open                 2579355 non-null  float64                         
 3    high                 2579355 non-null  float64                         
 4    low                  2579355 non-null  float64                         
 5    close                2579355 non-null  float64                         
 6    volume               2579355 non-null  int64                           
 7    datetime             2579355 non-null  datetime64[ns, America/New_York]
 8    day                  2579355 non-null  object                          
 9    month                2

In [10]:
# Persist the engineered feature frame into the local HDF5 store, using the
# queryable 'table' format.
# NOTE(review): the recorded output of this cell ends in
# `check_attribute_name(name)`, which looks like the tail of a PyTables
# NaturalNameWarning -- presumably because the last key component starts with
# a digit; confirm. The key is kept unchanged here because anything that reads
# this store depends on it.
hdf_path = './large_files/data.h5'
hdf_key = 'feature_engineering/20240201'
with pd.HDFStore(hdf_path) as store:
    store.put(key=hdf_key, value=df, format='table')

  check_attribute_name(name)
