# Multiple Variable Wavelet Preprocessing

In [1]:
%load_ext autoreload
%autoreload 2
p = print

import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from scipy import signal
import pywt

from crypr.util import get_project_path
from crypr.build import make_features, data_to_supervised

In [2]:
# Experiment configuration. Tx / Ty are handed to data_to_supervised and
# TEST_SIZE to train_test_split in later cells.
SYM = 'BTC'
TARGET = 'close'
Tx, Ty = 72, 1
TEST_SIZE = 0.05

# Read <project>/data/raw/<SYM>.csv, using the first CSV column as the index.
data_path = os.path.join(
    get_project_path(),
    'data',
    'raw',
    '{}.csv'.format(SYM),
)
data = pd.read_csv(data_path, index_col=0)
data.head()

Unnamed: 0,volumeto,volumefrom,open,high,close,low,time,timestamp
0,6935520.93,1096.93,6314.14,6322.24,6313.66,6290.84,1530471600,2018-07-01 21:00:00
1,24714923.33,3879.19,6313.94,6378.28,6347.37,6298.49,1530475200,2018-07-01 22:00:00
2,9517750.88,1496.49,6347.37,6359.64,6355.81,6331.35,1530478800,2018-07-01 23:00:00
3,12533312.19,1965.11,6355.81,6371.16,6355.41,6346.47,1530482400,2018-07-02 00:00:00
4,13213908.33,2074.03,6355.41,6371.17,6339.04,6337.97,1530486000,2018-07-02 01:00:00


In [3]:
"""
Get percent change feature and target data.
"""
# Build the feature frame from the raw data, then window it into supervised
# (X, y) pairs. The target column comes from the TARGET constant of the config
# cell instead of a repeated literal, so the two cannot drift apart.
df = make_features(input_df=data, target_col=TARGET, moving_average_lags=[])
X, y = data_to_supervised(input_df=df, Tx=Tx, Ty=Ty)
p(X.shape, y.shape)
X.head()

(5926, 504) (5926, 1)


Unnamed: 0,var1(t-72),var2(t-72),var3(t-72),var4(t-72),var5(t-72),var6(t-72),var7(t-72),var1(t-71),var2(t-71),var3(t-71),...,var5(t-2),var6(t-2),var7(t-2),var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1)
73,6313.94,6378.28,6347.37,6298.49,24714923.33,3879.19,0.533922,6347.37,6359.64,6355.81,...,13007702.37,1938.1,-0.045374,6674.92,6688.86,6687.7,6671.52,9378691.92,1399.77,0.192814
74,6347.37,6359.64,6355.81,6331.35,9517750.88,1496.49,0.132968,6355.81,6371.16,6355.41,...,9378691.92,1399.77,0.192814,6687.78,6696.86,6693.25,6686.64,8424503.46,1254.9,0.082988
75,6355.81,6371.16,6355.41,6346.47,12533312.19,1965.11,-0.006293,6355.41,6371.17,6339.04,...,8424503.46,1254.9,0.082988,6693.36,6697.32,6694.09,6673.0,11276851.07,1681.92,0.01255
76,6355.41,6371.17,6339.04,6337.97,13213908.33,2074.03,-0.257576,6342.69,6348.29,6334.51,...,11276851.07,1681.92,0.01255,6694.09,6697.69,6598.38,6562.33,27926930.49,4202.29,-1.429769
77,6342.69,6348.29,6334.51,6316.0,12405554.35,1954.61,-0.071462,6334.17,6340.97,6324.64,...,27926930.49,4202.29,-1.429769,6596.86,6605.67,6590.06,6567.87,14978437.33,2267.63,-0.126092


In [4]:
"""
Confirm data reshape and target/feature creation was done correctly.
"""
# The target of row i should reappear as the last column of X (the most recent
# lag of the last variable) in row i + 1. Drop the last target and the first
# feature row so the two vectors line up.
y_values_except_last = np.squeeze(y.iloc[:-1].values)
t_minus_1_x_values_except_first = X.iloc[1:, -1].values

# Compare element-wise. `a.all() == b.all()` only compared two truthiness
# flags and would report True for almost any pair of arrays.
np.allclose(y_values_except_last, t_minus_1_x_values_except_first)

True

In [5]:
"""
For comparing different transformations
"""
# Fixed row index of one sample, so different transforms can be inspected on the same row.
# NOTE(review): no other cell visible here reads sample_ix — confirm it is still needed.
sample_ix = 1000

In [6]:
"""
Reshape the data into 3d array if multiple variables.
"""
# The supervised frame's columns are time-major:
#   var1(t-72) ... var7(t-72), var1(t-71) ... var7(t-71), ...
# so every flat row is laid out as (Tx, n_vars). Reshape to that layout first,
# then swap the axes to obtain (samples, n_vars, Tx). Reshaping directly to
# (samples, -1, Tx) keeps the same shape but interleaves variables and time
# steps inside each "series", which corrupts the transform applied afterwards.
X = X.values.reshape((X.shape[0], Tx, -1)).swapaxes(1, 2)
p(X.shape)

(5926, 7, 72)


In [7]:
"""
Apply the wave transformation to the feature data.
"""
wt_type = 'DWT_HAAR'
p('Applying {} transform ...'.format(wt_type))

# Pick the per-series transform; it receives one 1-D slice along the last axis of X.
if wt_type == 'RICKER':
    # NOTE(review): `widths` is not defined in any cell visible here; this
    # branch raises NameError unless it is defined first.
    # NOTE(review): signal.cwt / signal.ricker are deprecated in recent SciPy
    # releases — confirm against the installed version.
    def wt_transform_fun(x):
        return signal.cwt(x, wavelet=signal.ricker, widths=widths)
elif wt_type == 'HAAR':
    # NOTE(review): `Haar` is not imported in any cell visible here; this
    # branch raises NameError unless it is imported first.
    def wt_transform_fun(x):
        return Haar(x).getpower()
elif wt_type == 'DWT_HAAR':
    def wt_transform_fun(x):
        # pywt.dwt returns a pair of coefficient arrays; stack them on a new leading axis.
        return np.stack(pywt.dwt(x, 'haar'))
else:
    # Name the offending value so a typo in wt_type is easy to spot.
    raise NotImplementedError('Unsupported wt_type: {!r}'.format(wt_type))

# Transform every series along the last axis of X.
X_wt = np.apply_along_axis(func1d=wt_transform_fun, axis=-1, arr=X)

X_wt.shape

Applying DWT_HAAR transform ...


(5926, 7, 2, 36)

In [8]:
"""
Condense wavelet features if multiple features analyzed.
"""
# Merge every axis between the sample axis and the last axis into a single
# channel axis, e.g. (samples, n_vars, n_bands, n_coeffs) ->
# (samples, n_vars * n_bands, n_coeffs). Computing the merged size from all
# middle axes gives the same result as shape[1] * shape[2] on the 4-D input and
# leaves an already-condensed 3-D array unchanged, so re-running this cell no
# longer raises a reshape error.
n_channels = int(np.prod(X_wt.shape[1:-1]))
X_wt = X_wt.reshape((X_wt.shape[0], n_channels, X_wt.shape[-1]))
N = X_wt.shape[-2:]
X_wt.shape, N

((5926, 14, 36), (14, 36))

In [9]:
"""
Swap the last two axes so the step (wavelet coefficient) axis is the 2nd dimension and the channels are last.
"""
# Exchange the two trailing axes: (samples, channels, steps) -> (samples, steps, channels).
X_wt_rs = np.swapaxes(X_wt, -2, -1)
p(X_wt_rs.shape)

(5926, 36, 14)


In [10]:
"""
Train Test Split.
"""
# Split without shuffling so the original row order is kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_wt_rs,
    y,
    test_size=TEST_SIZE,
    shuffle=False,
)

In [11]:
"""
Save data.
"""
output_dir = os.path.join(get_project_path(), 'data', 'processed')
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Suffix shared by all four files.
# NOTE(review): N is a tuple at this point, so its repr (e.g. "(14, 36)") ends
# up in the filename. Left unchanged in case downstream loaders rely on it.
file_suffix = '{}_{}_{}x{}'.format(SYM, wt_type, Tx, N)

# One loop instead of four copy-pasted calls. The X_train name previously
# carried a stray leading '.', which wrote a hidden file inconsistent with
# the other three names.
for split_name, split_arr in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    np.save(arr=split_arr, allow_pickle=True,
            file=os.path.join(output_dir, '{}_{}'.format(split_name, file_suffix)))