In [28]:
import pandas as pd
import numpy as np
import h5py
from sklearn.preprocessing import MinMaxScaler

In [29]:
class Sample:
    """Cut an array into (past, future) window pairs along its first axis.

    Parameters
    ----------
    N : int
        Number of leading steps of each window returned as the sample.
    K : int
        Number of trailing steps of each window returned as the target.
    sliding_window : bool, default True
        If True, consecutive windows start one step apart (overlapping).
        If False, windows are laid back to back and any trailing steps that
        do not fill a whole window are dropped.
    """

    def __init__(self, N, K, sliding_window = True):
        self.N = N
        self.K = K
        self.sliding_window = sliding_window

    def transform(self, A):
        """Split ``A`` into sample and target matrices.

        ``A`` is indexed as ``A.shape[0]``, ``A.shape[1]`` and ``A.shape[2]``.
        Windows of ``N + K`` consecutive entries are taken along axis 0, and
        the window axis is then merged with axis 1.

        Returns
        -------
        tuple of ndarray
            ``(samples, targets)`` where, per window, ``samples`` holds the
            first ``N * A.shape[1]`` rows and ``targets`` the remaining
            ``K * A.shape[1]`` rows.
        """
        window = self.N + self.K    # steps per window (sample + target)
        n_steps = A.shape[0]

        # First index of every window.
        if self.sliding_window:
            starts = np.arange(n_steps - window + 1)
        else:
            # Stop early enough that the last window never runs past the end.
            stop = n_steps if n_steps % window == 0 else n_steps - window
            starts = np.arange(0, stop, window)

        # Row w lists the `window` consecutive indices of window w.
        index = np.arange(window) + starts.reshape(-1, 1)

        stacked = A[index].reshape(-1, window * A.shape[1], A.shape[2])
        split = self.N * A.shape[1]     # rows belonging to the sample part
        samples, targets = stacked[:, :split], stacked[:, split:]
        return samples, targets

In [30]:
#data file path
dfp = 'data/BTCUSDT-1m-1Jun2020to1Dec2020data.csv'

#Columns of price data to use
columns = ['Close']

# Parse the CSV a single time and derive every frame from that one read.
# (To work on a gap-free recent subset instead, apply e.g.
#  .dropna().tail(1000000) to raw_df at this point.)
raw_df = pd.read_csv(dfp)
time_stamps = raw_df['Timestamp']

# Working frame: restricted to the selected price columns.
df = raw_df.loc[:, columns].copy()
# Separate copy of the same columns, so changes to `df` never touch it.
original_df = raw_df.loc[:, columns].copy()

In [31]:
scaler = MinMaxScaler()
# normalization
# MinMaxScaler rescales every feature (column) independently, so a single fit
# over all selected columns yields the same scaled values as fitting them one
# by one, while leaving `scaler` fitted on every column rather than only on
# the last one visited by a loop.
df[columns] = scaler.fit_transform(df[columns].to_numpy())

In [35]:
#%%Features are channels
# Insert a length-1 axis in the middle: (rows, columns) -> (rows, 1, columns).
A = np.array(df)[:, np.newaxis, :]
original_A = np.array(original_df)[:, np.newaxis, :]
# Append two length-1 axes after the row axis so the timestamps can go
# through the same windowing code as the price arrays.
time_stamps = np.array(time_stamps)[:, np.newaxis, np.newaxis]
#%%Make samples of temporal sequences of pricing data (channel)
NPS = 256                  #Number of past samples
NFS = 16                   #Number of future samples
ps = Sample(N=NPS, K=NFS, sliding_window=False)
# Window the scaled prices, the timestamps and the unscaled prices identically.
B, Y = ps.transform(A)
input_times, output_times = ps.transform(time_stamps)
original_B, original_Y = ps.transform(original_A)

print(original_Y)

[[[ 9588.1 ]
  [ 9584.25]
  [ 9591.67]
  ...
  [ 9567.72]
  [ 9569.46]
  [ 9559.99]]

 [[ 9544.44]
  [ 9534.51]
  [ 9533.51]
  ...
  [ 9545.53]
  [ 9546.77]
  [ 9548.57]]

 [[ 9558.78]
  [ 9562.08]
  [ 9560.3 ]
  ...
  [ 9560.25]
  [ 9561.68]
  [ 9561.01]]

 ...

 [[18517.85]
  [18525.69]
  [18520.  ]
  ...
  [18600.62]
  [18602.37]
  [18596.34]]

 [[19695.82]
  [19710.05]
  [19675.08]
  ...
  [19490.02]
  [19476.37]
  [19482.72]]

 [[19212.6 ]
  [19259.98]
  [19177.79]
  ...
  [19381.73]
  [19366.44]
  [19382.49]]]


In [33]:

# (dataset name, array) pairs, listed in the order they are written.
h5_payload = (
    ("inputs", B),
    ('outputs', Y),
    ("input_times", input_times),
    ('output_times', output_times),
    ("original_datas", np.array(original_df)),
    ('original_inputs', original_B),
    ('original_outputs', original_Y),
)

# Mode 'w' creates the file, truncating it if it already exists.
with h5py.File('file_name', 'w') as f:
    for h5_name, h5_array in h5_payload:
        f.create_dataset(h5_name, data = h5_array)
    # Show the stored target dataset's shape and dtype as a sanity check.
    print(f['outputs'])
#     f.create_dataset('original_times', data=time_stamps)

<HDF5 dataset "outputs": shape (967, 16, 1), type "<f8">


In [37]:
# Inspect the sample (input) windows produced by ps.transform(A).
print(B)

[[[0.05272414]
  [0.05294818]
  [0.05304289]
  ...
  [0.06523578]
  [0.06549261]
  [0.06669478]]

 [[0.06359373]
  [0.063489  ]
  [0.06332507]
  ...
  [0.06256187]
  [0.0623269 ]
  [0.06249812]]

 [[0.06268938]
  [0.0625983 ]
  [0.06231689]
  ...
  [0.06332507]
  [0.06257371]
  [0.06332689]]

 ...

 [[0.88523402]
  [0.88464295]
  [0.88527955]
  ...
  [0.88023409]
  [0.87946088]
  [0.87898275]]

 [[0.88589066]
  [0.88716022]
  [0.88868023]
  ...
  [0.98774245]
  [0.98518056]
  [0.98133818]]

 [[0.97337746]
  [0.97050774]
  [0.96718812]
  ...
  [0.94541797]
  [0.94181602]
  [0.94298085]]]
