In [2]:
import sys
sys.path.append("..")

# Reload modules in /src/ when changed
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

In [3]:
df = pd.read_pickle('../data/resampled/USDJPY-5m-20240101-20241231.pkl')
labels = pd.read_pickle('../data/labels/direction_labels/USDJPY-5m-20240101-20241231-EMA_CROSS_9_20-TB.pkl')

In [4]:
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,spread
0,2024-01-01 22:00:00,140.873,140.873,140.8655,140.8655,12000000000.0,0.04575
1,2024-01-01 22:05:00,140.866,140.867,140.866,140.867,4200000000.0,0.05
2,2024-01-01 22:10:00,140.87,140.895,140.8655,140.8655,38420000000.0,0.034941
3,2024-01-01 22:15:00,140.866,140.8685,140.866,140.8685,10500000000.0,0.04925
4,2024-01-01 22:20:00,140.8705,140.8785,140.8675,140.8785,161620000000.0,0.043016


In [8]:
feature_name = 'close'

In [9]:
seq_length = 30

In [10]:
feature_values = df[feature_name].values
feature_values

array([140.8655, 140.867 , 140.8655, ..., 156.9955, 157.0125, 157.0085],
      shape=(74711,))

In [11]:
len_df = df.shape[0]
len_df

74711

## Creating Slices

In [13]:
seq_len = 10

In [46]:
df0 = df.iloc[0:0+seq_len].values
df0.shape

(10, 7)

In [25]:
group = np.full((seq_len, 1), 0)
group.shape

(10, 1)

In [54]:
time = np.arange(0, seq_len).reshape(seq_len, 1)
time.shape

(10, 1)

In [59]:
slice_values = np.hstack([df0, group, time])
slice_values

array([[Timestamp('2024-01-01 22:00:00'), 140.873, 140.873, 140.8655,
        140.8655, 11999999761.58142, 0.04574999999999463, 0, 0],
       [Timestamp('2024-01-01 22:05:00'), 140.86599999999999,
        140.86700000000002, 140.86599999999999, 140.86700000000002,
        4200000047.683716, 0.04999999999998295, 0, 1],
       [Timestamp('2024-01-01 22:10:00'), 140.87, 140.89499999999998,
        140.8655, 140.8655, 38420000106.09627, 0.03494117647057786, 0, 2],
       [Timestamp('2024-01-01 22:15:00'), 140.86599999999999,
        140.86849999999998, 140.86599999999999, 140.86849999999998,
        10500000000.0, 0.04924999999999358, 0, 3],
       [Timestamp('2024-01-01 22:20:00'), 140.8705, 140.8785, 140.8675,
        140.8785, 161620001435.27985, 0.04301587301587224, 0, 4],
       [Timestamp('2024-01-01 22:25:00'), 140.87349999999998, 140.9625,
        140.87349999999998, 140.9305, 238190001919.8656,
        0.0779761904761869, 0, 5],
       [Timestamp('2024-01-01 22:30:00'), 140.9305, 

In [57]:
columns = df.columns.to_list() + ['group', 'time']
columns

['timestamp',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'spread',
 'group',
 'time']

In [60]:
df_slice = pd.DataFrame(slice_values, columns=columns)
df_slice

Unnamed: 0,timestamp,open,high,low,close,volume,spread,group,time
0,2024-01-01 22:00:00,140.873,140.873,140.8655,140.8655,11999999761.58142,0.04575,0,0
1,2024-01-01 22:05:00,140.866,140.867,140.866,140.867,4200000047.683716,0.05,0,1
2,2024-01-01 22:10:00,140.87,140.895,140.8655,140.8655,38420000106.09627,0.034941,0,2
3,2024-01-01 22:15:00,140.866,140.8685,140.866,140.8685,10500000000.0,0.04925,0,3
4,2024-01-01 22:20:00,140.8705,140.8785,140.8675,140.8785,161620001435.27985,0.043016,0,4
5,2024-01-01 22:25:00,140.8735,140.9625,140.8735,140.9305,238190001919.8656,0.077976,0,5
6,2024-01-01 22:30:00,140.9305,140.9385,140.9305,140.935,75200000166.89299,0.090167,0,6
7,2024-01-01 22:35:00,140.9385,140.9805,140.9015,140.9015,127150001168.25104,0.073375,0,7
8,2024-01-01 22:40:00,140.9025,140.9095,140.902,140.9095,62550000667.572014,0.083545,0,8
9,2024-01-01 22:45:00,140.9045,140.91,140.9045,140.908,82500002503.39507,0.065306,0,9


## Loop through the whole dataframe

In [103]:
df_sequential = pd.DataFrame(
    np.vstack(
        [np.hstack([
            df.iloc[i:i+seq_len].values,  
            np.full((seq_len, 1), i),
            np.arange(0, seq_len).reshape(seq_len, 1)
        ]) for i in range(0, df.shape[0]-seq_len)]),
    columns=df.columns.to_list() + ['group', 'seq'],
)


In [104]:
df_sequential

Unnamed: 0,timestamp,open,high,low,close,volume,spread,group,seq
0,2024-01-01 22:00:00,140.873,140.873,140.8655,140.8655,11999999761.581421,0.04575,0,0
1,2024-01-01 22:05:00,140.866,140.867,140.866,140.867,4200000047.683716,0.05,0,1
2,2024-01-01 22:10:00,140.87,140.895,140.8655,140.8655,38420000106.096268,0.034941,0,2
3,2024-01-01 22:15:00,140.866,140.8685,140.866,140.8685,10500000000.0,0.04925,0,3
4,2024-01-01 22:20:00,140.8705,140.8785,140.8675,140.8785,161620001435.279846,0.043016,0,4
...,...,...,...,...,...,...,...,...,...
747005,2024-12-30 23:30:00,157.028,157.052,156.987,157.0185,1337650005936.622559,0.010489,74700,5
747006,2024-12-30 23:35:00,157.016,157.016,156.961,157.0075,1602750017762.184082,0.011005,74700,6
747007,2024-12-30 23:40:00,157.0075,157.0345,157.005,157.0335,1957769995689.39209,0.014153,74700,7
747008,2024-12-30 23:45:00,157.033,157.041,156.983,156.9955,919860003352.165283,0.009822,74700,8


In [107]:
expected_diff = pd.Timedelta("5min")  # 假設你要 5 分鐘間隔
is_continuous = (
    df_sequential.groupby("group")["timestamp"]
      .apply(lambda s: s.diff().dropna().eq(expected_diff).all())
)

valid_groups = is_continuous[is_continuous].index
# num_valid_group = len(valid_groups)
df_sequential = df_sequential[df_sequential["group"].isin(valid_groups)].copy()
# df_sequential['group'] = np.vstack([np.full((1, seq_len), i).reshape(10, 1) for i in range(0,num_valid_group)])
df_sequential

Unnamed: 0,timestamp,open,high,low,close,volume,spread,group,seq
0,2024-01-01 22:00:00,140.873,140.873,140.8655,140.8655,11999999761.581421,0.04575,0,0
1,2024-01-01 22:05:00,140.866,140.867,140.866,140.867,4200000047.683716,0.05,0,1
2,2024-01-01 22:10:00,140.87,140.895,140.8655,140.8655,38420000106.096268,0.034941,0,2
3,2024-01-01 22:15:00,140.866,140.8685,140.866,140.8685,10500000000.0,0.04925,0,3
4,2024-01-01 22:20:00,140.8705,140.8785,140.8675,140.8785,161620001435.279846,0.043016,0,4
...,...,...,...,...,...,...,...,...,...
747005,2024-12-30 23:30:00,157.028,157.052,156.987,157.0185,1337650005936.622559,0.010489,74145,5
747006,2024-12-30 23:35:00,157.016,157.016,156.961,157.0075,1602750017762.184082,0.011005,74145,6
747007,2024-12-30 23:40:00,157.0075,157.0345,157.005,157.0335,1957769995689.39209,0.014153,74145,7
747008,2024-12-30 23:45:00,157.033,157.041,156.983,156.9955,919860003352.165283,0.009822,74145,8


In [108]:
df_sequential["group"] = pd.factorize(df_sequential["group"])[0]
df_sequential

Unnamed: 0,timestamp,open,high,low,close,volume,spread,group,seq
0,2024-01-01 22:00:00,140.873,140.873,140.8655,140.8655,11999999761.581421,0.04575,0,0
1,2024-01-01 22:05:00,140.866,140.867,140.866,140.867,4200000047.683716,0.05,0,1
2,2024-01-01 22:10:00,140.87,140.895,140.8655,140.8655,38420000106.096268,0.034941,0,2
3,2024-01-01 22:15:00,140.866,140.8685,140.866,140.8685,10500000000.0,0.04925,0,3
4,2024-01-01 22:20:00,140.8705,140.8785,140.8675,140.8785,161620001435.279846,0.043016,0,4
...,...,...,...,...,...,...,...,...,...
747005,2024-12-30 23:30:00,157.028,157.052,156.987,157.0185,1337650005936.622559,0.010489,74145,5
747006,2024-12-30 23:35:00,157.016,157.016,156.961,157.0075,1602750017762.184082,0.011005,74145,6
747007,2024-12-30 23:40:00,157.0075,157.0345,157.005,157.0335,1957769995689.39209,0.014153,74145,7
747008,2024-12-30 23:45:00,157.033,157.041,156.983,156.9955,919860003352.165283,0.009822,74145,8
