In [1]:
import dask
from dask.distributed import Client, progress
from dask import delayed
import dask.dataframe as dd
import dask.array as da
import pandas as pd 
import numpy as np
from datetime import datetime, timedelta, date
from scipy import signal
from dask.dataframe.utils import make_meta 

In [2]:
client = Client(n_workers=8, threads_per_worker=2, memory_limit='1GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:60612  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 16  Memory: 7.45 GiB


In [3]:
TRAIN_SIZE_CRITICAL_POINTS = .8 #percentage of MIN and MAX data devoted to training
NEITHER_SIZE_MULTIPLE = 10 #Size of the non-critical point data set in multiples of the critical-point dataset size (i.e. for undersampling the non-critical point dataset)
TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE = 120

In [4]:
data_resampled_dask = dd.read_csv("DataFromDeepLearningProcessing/DOGE_Deep_2021-04-16.csv")

In [5]:
data_resampled_dask.columns

Index(['datetime', 'mark_price', 'ask_price', 'bid_price', 'spread',
       'mark_price_10T_velocity', 'mark_price_60T_velocity',
       'mark_price_1440T_velocity', 'mark_price_10T_mean',
       'mark_price_60T_mean', 'mark_price_1440T_mean', 'mark_price_10T_std',
       'mark_price_60T_std', 'mark_price_1440T_std',
       'mark_price_10T_acceleration_for_10T_velocity',
       'mark_price_60T_acceleration_for_60T_velocity', 'datetimeNotTheIndex',
       'min', 'max', 'minmax'],
      dtype='object')

In [6]:
# data_resampled_dask = dd.read_csv("DOGE_Deep_2021-04-16.csv")
data_resampled_dask["datetime"] = dd.to_datetime(data_resampled_dask["datetime"])
data_resampled_dask["datetimeNotTheIndex"] = dd.to_datetime(
    data_resampled_dask["datetimeNotTheIndex"])

data_resampled_dask = data_resampled_dask.set_index("datetime", sorted=True)

training_columns = ['mark_price', 'ask_price', 'bid_price', 'spread',
                    'mark_price_10T_velocity', 'mark_price_60T_velocity',
                    'mark_price_1440T_velocity', 'mark_price_10T_mean',
                    'mark_price_60T_mean', 'mark_price_1440T_mean', 'mark_price_10T_std',
                    'mark_price_60T_std', 'mark_price_1440T_std',
                    'mark_price_10T_acceleration_for_10T_velocity',
                    'mark_price_60T_acceleration_for_60T_velocity', "minmax"]

data_resampled_dask = data_resampled_dask[training_columns]

In [7]:
print("Velocity Max ")
print("10T Max: ", data_resampled_dask.mark_price_10T_velocity.max().compute(), "10T Min: ",  data_resampled_dask.mark_price_10T_velocity.min().compute())
print(data_resampled_dask.mark_price_60T_velocity.max().compute())
print(data_resampled_dask.mark_price_1440T_velocity.max().compute())

Velocity Max 
10T Max:  0.0045303177106794 10T Min:  -0.0054550717149153
0.0014198635744729
0.0001493711528956


In [8]:
print("Acceleration Min/Max")
print("10T Max: ", data_resampled_dask.mark_price_10T_acceleration_for_10T_velocity.max().compute(), "10T Min: ",  data_resampled_dask.mark_price_10T_acceleration_for_10T_velocity.min().compute())
print("60T Max: ", data_resampled_dask.mark_price_60T_acceleration_for_60T_velocity.max().compute(), "60T Min: ",  data_resampled_dask.mark_price_60T_acceleration_for_60T_velocity.min().compute())

Acceleration Min/Max
10T Max:  0.000816395032366 10T Min:  -0.0007903738584578
60T Max:  2.152452744598861e-05 60T Min:  -3.52456671819301e-05


In [9]:
print("STD Min/Max")
print("10T Max: ", data_resampled_dask.mark_price_10T_std.max().compute(), "10T Min: ",  data_resampled_dask.mark_price_10T_std.min().compute())
print("60T Max: ", data_resampled_dask.mark_price_60T_std.max().compute(), "60T Min: ",  data_resampled_dask.mark_price_60T_std.min().compute())
print("1440T Max: ", data_resampled_dask.mark_price_1440T_std.max().compute(), "1440T Min: ",  data_resampled_dask.mark_price_1440T_std.min().compute())

STD Min/Max
10T Max:  0.0185593805333707 10T Min:  0.0
60T Max:  0.0272826194257299 60T Min:  4.3099397711767675e-11
1440T Max:  0.0466893674281633 1440T Min:  2.742964702065232e-06


In [10]:
min_indices = np.array(data_resampled_dask[data_resampled_dask["minmax"] == 1].index.compute().tolist())
max_indices = np.array(data_resampled_dask[data_resampled_dask["minmax"] == 2].index.compute().tolist())
neither_indices = np.array(data_resampled_dask[data_resampled_dask["minmax"] == 0].index.compute().tolist())

In [11]:
X = data_resampled_dask.drop("minmax", axis=1).to_dask_array().compute()
Y = data_resampled_dask["minmax"].to_dask_array().compute()

In [12]:
len(Y)

559972

In [13]:
#The level 1 train/test split should NOT be shuffled! We want to test the final model on unshuffled data
SpliteIndex = int(len(Y)*TRAIN_SIZE_CRITICAL_POINTS)
X_train_level_1 = X[:SpliteIndex, :]
Y_train_level_1 = Y[:SpliteIndex]

X_test_level_1 = da.from_array(X[SpliteIndex:, :])
Y_test_level_1 = da.from_array(Y[SpliteIndex:])

print("Train Size: " + str(len(Y_train_level_1)))
print("Test Size: " + str(len(Y_test_level_1)))

Train Size: 447977
Test Size: 111995


In [14]:
X_train_level_1
Y_train_level_1

array([0, 0, 2, ..., 0, 0, 0], dtype=int64)

In [15]:
minIndices = np.where(Y_train_level_1 == 1)[0]
minIndices = minIndices[np.where(minIndices >= TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE)] #only take indices above TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE (i.e. min window size). Also remove final index, if present, since cannot build the window off that. Also remove final index, if present, since cannot build the window off that

maxIndices = np.where(Y_train_level_1 == 2)[0]
maxIndices = maxIndices[np.where(maxIndices >= TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE)]   #Only take indices above TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE (i.e. min window size). Also remove final index, if present, since cannot build the window off that

neitherIndices = np.where(Y_train_level_1 == 0)[0]
neitherIndices = neitherIndices[np.where(neitherIndices >= TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE)] #Only take indices above TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE (i.e. min window size). Also remove final index, if present, since cannot build the window off that
                                    

In [16]:
print(type(minIndices))
print(len(minIndices))
print(len(maxIndices))
print(len(neitherIndices))

<class 'numpy.ndarray'>
3709
3710
440438


In [17]:
np.random.shuffle(minIndices)
np.random.shuffle(maxIndices)
np.random.shuffle(neitherIndices)

In [18]:
minIndices

array([104912, 210788,  98645, ..., 122714, 408009, 290708], dtype=int64)

In [19]:
def trainTestSplitAClass(indexArray, splitIndex, endIndex):
    print(splitIndex, endIndex)
    if(np.isnan(endIndex)):
        Train= indexArray[:splitIndex]
        Test = indexArray[splitIndex:]
    else:
        Train= indexArray[:splitIndex]
        Test = indexArray[splitIndex:endIndex]
    print("Train Size: " + str(len(Train)))
    print("Test Size: " + str(len(Test)))

    return Train, Test

In [20]:
train_list = []
test_list = []

minSplitIndex = int(len(minIndices)*TRAIN_SIZE_CRITICAL_POINTS)
maxSplitIndex = int(len(maxIndices)*TRAIN_SIZE_CRITICAL_POINTS)
neitherEndIndex = int((minSplitIndex+maxSplitIndex)*NEITHER_SIZE_MULTIPLE)
neitherSplitIndex = int(neitherEndIndex*TRAIN_SIZE_CRITICAL_POINTS)
for theIndexArray, splitIndex, endIndex in zip(
    [minIndices, maxIndices, neitherIndices],
    [minSplitIndex, maxSplitIndex, neitherSplitIndex], 
    [np.nan, np.nan, neitherEndIndex]):
    print("array len: " + str(len(theIndexArray)))
    print("splitIndex: " + str(splitIndex))
    trainTemp, testTemp = trainTestSplitAClass(theIndexArray, splitIndex, endIndex)

    train_list.append(trainT  emp)
    test_list.append(testTemp)
    print("")

TrainIndices = np.append(train_list[0], train_list[1], axis=0)
TrainIndices = np.append(TrainIndices, train_list[2], axis=0)
np.random.shuffle(TrainIndices)

TestIndices = np.append(test_list[0], test_list[1], axis=0)
TestIndices = np.append(TestIndices, test_list[2], axis=0)
np.random.shuffle(TestIndices)

array len: 3709
splitIndex: 2967
2967 nan
Train Size: 2967
Test Size: 742

array len: 3710
splitIndex: 2968
2968 nan
Train Size: 2968
Test Size: 742

array len: 440438
splitIndex: 47480
47480 59350
Train Size: 47480
Test Size: 11870



In [24]:



numWindowsTrain = len(TrainIndices)
numColumns = X.shape[1]
X_Train = np.zeros([numWindowsTrain, TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE, numColumns])
Y_Train = np.empty(shape=numWindowsTrain, dtype=np.int32)
for  index, windowIndex in zip(range(0, len(TrainIndices)), TrainIndices):
    #The final row of of the window needs to include windowIndex, so build the start and stop indices accordingly
    windowStartIndex = windowIndex - TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE + 1
    
    #If the windowIndex happens to the be final index in the X array, then handle that situation
    if(windowIndex == len(X)-1):
        X_Train[index] = X[windowStartIndex:, :]
    else: #This is normal situation (i.e. windowIndex is not the final index in the array)
        windowEndIndex = windowIndex + 1
        X_Train[index] = X[windowStartIndex:windowEndIndex, :]
    Y_Train[index] = Y[windowIndex] #Take hte label from The final row in the window

    
    data = X_Train[windowIndex, :, :]
    data_normalized = (data - data.min()) / (data.max() - data.min())
    X_Train[windowIndex, :, :] = data_normalized
    
    print()
    print("data shape: " + str(data.shape))
    print("data normalized shape: " + str(data_normalized.shape))
    print(data)
    print(data_normalized)


X_Train = da.from_array(X_Train) 
Y_Train = da.from_array(Y_Train)


data shape: (120, 15)
data normalized shape: (120, 15)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
data shape: (120, 15)
data normalized shape: (120, 15)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
  data_normalized = (data - data.min()) / (data.max() - data.min())


IndexError: index 273915 is out of bounds for axis 0 with size 53415

numWindowsTest = len(TestIndices)
numColumns = X.shape[1]
X_Test = np.zeros([numWindowsTest, TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE, numColumns])
Y_Test = np.empty(shape=numWindowsTest, dtype=np.int32)
for index, windowIndex in zip(range(0, len(TestIndices)), TestIndices):
    #The final row of of the window needs to include windowIndex, so build the start and stop indices accordingly
    windowStartIndex = windowIndex - TRAIN_TEST_SPLIT__MIN_WINDOW_SIZE + 1
    
    #If the windowIndex happens to the be final index in the X array, then handle that situation
    if(windowIndex == len(X)-1):
        X_Test[index] = X[windowStartIndex:, :]
    else: #This is normal situation (i.e. windowIndex is not the final index in the array)
        windowEndIndex = windowIndex + 1
        X_Test[index] = X[windowStartIndex:windowEndIndex, :]
    Y_Test[index] = Y[windowIndex] #Take the label from The final row in the window

X_Test = da.from_array(X_Test)
Y_Test = da.from_array(Y_Test)

In [39]:
print(X_Train.shape)
print(Y_Train.shape)
print(X_Test.shape)
print(Y_Test.shape)
print(type(X_Train))

(53415, 120, 15)
(53415,)
(13354, 120, 15)
(13354,)
<class 'dask.array.core.Array'>


In [40]:
dir = "DataFromDeepLearningProcessing/DODGE/X_Train"
if(not os.path.exists(dir)):
            os.makedirs(dir)
da.to_npy_stack(dir, X_Train, axis=0)

dir = "DataFromDeepLearningProcessing/DODGE/Y_Train"
if(not os.path.exists(dir)):
            os.makedirs(dir)
da.to_npy_stack(dir, Y_Train, axis=0)

In [41]:
dir = "DataFromDeepLearningProcessing/DODGE/X_Test"
if(not os.path.exists(dir)):
            os.makedirs(dir)
da.to_npy_stack(dir, X_Test, axis=0)

dir = "DataFromDeepLearningProcessing/DODGE/Y_Test"
if(not os.path.exists(dir)):
            os.makedirs(dir)
da.to_npy_stack(dir, Y_Test, axis=0)

In [48]:
dir = "DataFromDeepLearningProcessing/DODGE/X_Test_Level_1"
if(not os.path.exists(dir)):
            os.makedirs(dir)
da.to_npy_stack(dir, X_test_level_1, axis=0)

In [49]:
dir = "DataFromDeepLearningProcessing/DODGE/Y_Test_Level_1"
if(not os.path.exists(dir)):
            os.makedirs(dir)
da.to_npy_stack(dir, Y_test_level_1, axis=0)