# LIBRARIES

In [None]:
######################################################## LIBRARIES ########################################################
import pandas as pd

import numpy as np

import os

import CONSTANTS as c

# CONSTANTS

In [10]:
######################################################## CONSTANTS ########################################################
# Local aliases for values defined in the project's CONSTANTS module (imported as `c`).

# Folder locations used by this notebook.
PREPROCESSED_DATA_PATH = c.PREPROCESSED_DATA_PATH              # walked for the input arrays
FEATURE_ENGINEERED_DATA_PATH = c.FEATURE_ENGINEERED_DATA_PATH  # mirrored per-sub-folder output root
DISERT_DATA_PATH = c.DISERT_DATA_PATH                          # destination of the saved DisERT arrays

# Discretization settings handed to DisERT.
BIN_BOUNDARIES = c.BIN_BOUNDARIES
SIZE_OF_CHUNK = c.SIZE_OF_CHUNK
# NOTE(review): not referenced in the visible part of this notebook; presumably the
# number of entries in BIN_BOUNDARIES - confirm against CONSTANTS.
NR_OF_BOUNDARIES = c.NR_OF_BOUNDARIES

# DATA SOURCE

In [1]:
######################################################## DATA SOURCE ########################################################
# Load the arrays of up to `nMaxFoldersToLoad` sub-folders of PREPROCESSED_DATA_PATH
# and stack them along the sample axis into X, X_TIME and Y.
nMaxFoldersToLoad = 3

# Every one of these must already exist in the mirrored output folder for the
# sub-folder to be skipped.
aExpectedFiles = {'X_ORIGINAL.npy', 'X_TIME_ORIGINAL.npy', 'Y_ORIGINAL.npy'}

aLoadedX, aLoadedXTime, aLoadedY = [], [], []

i = 0  # number of sub-folders loaded so far
for sFolderPath, _, _ in os.walk(PREPROCESSED_DATA_PATH):
    # os.walk also yields the root itself; only its sub-folders hold arrays.
    if sFolderPath == PREPROCESSED_DATA_PATH:
        continue

    sSubFolder = os.path.basename(sFolderPath)
    sFolderToWrite = os.path.join(FEATURE_ENGINEERED_DATA_PATH, sSubFolder)
    os.makedirs(sFolderToWrite, exist_ok=True)

    # Skip sub-folders whose output folder already contains all three arrays.
    if aExpectedFiles.issubset(os.listdir(sFolderToWrite)):
        continue

    X_ORIGINAL = np.load(os.path.join(sFolderPath, 'X_ORIGINAL.npy'))
    X_TIME_ORIGINAL = np.load(os.path.join(sFolderPath, 'X_TIME_ORIGINAL.npy'))
    Y_ORIGINAL = np.load(os.path.join(sFolderPath, 'Y_ORIGINAL.npy'))

    # Collect first and concatenate once after the loop instead of re-copying
    # the growing arrays on every iteration.
    aLoadedX.append(X_ORIGINAL)
    aLoadedXTime.append(X_TIME_ORIGINAL)
    aLoadedY.append(Y_ORIGINAL)

    i += 1
    if i == nMaxFoldersToLoad:
        break

if i == 0:
    # Without this, the first later use of X would fail with an unrelated NameError.
    raise FileNotFoundError(
        f'No sub-folder with arrays to load was found under {PREPROCESSED_DATA_PATH!r}.'
    )

X = np.concatenate(aLoadedX)
X_TIME = np.concatenate(aLoadedXTime)
Y = np.concatenate(aLoadedY)

            


# Keep only index 0 of the fourth axis of the sample arrays.
X, Y = X[:, :, :, 0], Y[:, :, :, 0]


# Price history used to turn change rates back into prices.
sExchangeRate = 'EURUSD'
sPriceFilePath = r'Data\{}_M1_202010010001_202210312359.csv'.format(sExchangeRate)
df = pd.read_csv(sPriceFilePath, sep = '\t')

# <DATE> and <TIME> are joined without a separator, which the format string mirrors.
sDateTimeText = df['<DATE>'] + df['<TIME>']
df['PRICE_TIME_STAMP'] = pd.to_datetime(sDateTimeText, format='%Y.%m.%d%H:%M:%S')
df = df.drop(columns = ['<DATE>', '<TIME>'])
df['EXCHANGE_RATE'] = sExchangeRate


# Rebuild one timestamp per sample from the calendar parts stored in X_TIME.
dfTime = pd.DataFrame(
    data = X_TIME,
    columns = ['MINUTE', 'HOUR', 'DAY_OF_WEEK', 'DAY_OF_MONTH', 'MONTH', 'YEAR'])

# Assemble the timestamp from the numeric columns instead of formatting and
# re-parsing text: the text built before had no seconds although the format
# string required ':%S', and float-typed parts would have rendered as '2020.0'.
dfTime['TIME_STAMP'] = pd.to_datetime(
    dfTime[['YEAR', 'MONTH', 'DAY_OF_MONTH', 'HOUR', 'MINUTE']].rename(
        columns = {
            'YEAR': 'year',
            'MONTH': 'month',
            'DAY_OF_MONTH': 'day',
            'HOUR': 'hour',
            'MINUTE': 'minute',
        }
    )
)

dfTime = dfTime[['TIME_STAMP']]


# Look up the close price quoted at each sample's timestamp.
dfMatchedPrices = dfTime.merge(
    right = df[['PRICE_TIME_STAMP', '<CLOSE>']],
    left_on = 'TIME_STAMP',
    right_on = 'PRICE_TIME_STAMP',
    how = 'inner'
)
aClosePrices = dfMatchedPrices[['<CLOSE>']].values

# An inner merge drops samples without a matching price row and repeats samples
# whose timestamp occurs more than once in the price file. Either case would
# misalign prices and samples, so fail with a clear message here.
if aClosePrices.shape[0] != X.shape[0] or aClosePrices.shape[0] != Y.shape[0]:
    raise ValueError(
        'Matched {} close prices for {} X samples and {} Y samples; every sample '
        'needs exactly one price row.'.format(aClosePrices.shape[0], X.shape[0], Y.shape[0])
    )

# (nr_of_samples, 1) -> (nr_of_samples, 1, 1) so one price broadcasts over a whole sample.
aClosePrices = np.expand_dims(aClosePrices, 2)


# price = close + change_rate * close
X = (X * aClosePrices) + aClosePrices
Y = (Y * aClosePrices) + aClosePrices

# Flatten everything after the sample axis.
X = X.reshape(X.shape[0] , -1)
Y = Y.reshape(Y.shape[0] , -1)


# X_TIME consists of minute, hour, day of week, day of month, month, year.
# Keep the first four columns, i.e. exclude month and year.
X_TIME = X_TIME[:, :4]

# Untouched snapshots that later cells copy from.
X_SOURCE = X.copy()
Y_SOURCE = Y.copy()
X_TIME_SOURCE = X_TIME.copy()

# ANALYSIS

In [5]:
######################################################## ANALYSIS ########################################################
import tensorflow as tf

class DisERT(tf.Module):
    """Turn lagged series into masked-chunk (MLM) and shuffled-forecast (NSP) arrays.

    A series is cut into chunks of ``p_size_of_chunk`` time steps, and every chunk
    is replaced by the share of its values that falls into each bin defined by
    ``p_bin_boundaries``.
    """

    def __init__(
        self,
        p_bin_boundaries,
        p_size_of_chunk
    ):
        """Store the chunk size and build the chunk-to-bin-distribution pipeline.

        Args:
            p_bin_boundaries: bin edges handed to the Keras ``Discretization`` layer.
            p_size_of_chunk: number of consecutive time steps that form one chunk.
        """
        super().__init__()

        self.size_of_chunk = p_size_of_chunk

        aLayers = [
            # (nr_of_samples, time_steps) -> (nr_of_samples, nr_of_chunks, size_of_chunk)
            tf.keras.layers.Reshape((-1, p_size_of_chunk)),
            # per chunk: how many values fall into each bin
            tf.keras.layers.TimeDistributed(
                tf.keras.layers.Discretization(
                    bin_boundaries = p_bin_boundaries,
                    output_mode = 'count'
                )
            ),
            # counts -> shares of the chunk (softmax could be applied as well)
            tf.keras.layers.Rescaling(scale = 1/p_size_of_chunk)
        ]

        self.InputBinDistributionByChunks = tf.keras.Sequential(aLayers)


    def mask(self, x, p_masking_rate = 0.20, p_mask_value = 0.555):
        """Mask random time steps of ``x`` and return the input/target pair.

        Args:
            x: array-like in the format (nr_of_samples, time_step, nr_of_features).
            p_masking_rate: probability with which each (sample, time step) is masked.
            p_mask_value: value written over every feature of a masked time step.

        Returns:
            ``(x_mlm, y_mlm)`` as NumPy arrays shaped like ``x``. ``x_mlm`` is a copy
            of ``x`` with masked time steps overwritten; ``y_mlm`` holds the original
            values at masked time steps and -1 ("ignore") everywhere else.
        """
        # Work on a NumPy view so boolean indexing behaves the same for tensors and arrays.
        x = np.asarray(x)

        # Mask whole time steps; draws come from the global NumPy random state.
        mask = np.random.rand(x.shape[0], x.shape[1]) < p_masking_rate

        # Targets default to -1, which means "ignore".
        y_mlm = -1 * np.ones(x.shape)

        # Labels for the masked time steps.
        y_mlm[mask] = x[mask]

        # Masked input; the scalar is broadcast over all features of a masked step.
        x_mlm = np.copy(x)
        x_mlm[mask] = p_mask_value

        return x_mlm, y_mlm


    def inject_noise(self, x, y):
        """Append negative next-sentence samples built from shuffled forecast chunks.

        Reads ``self.backward_nr_of_chunks``, which ``preprocess`` sets, so this must
        not be called before ``preprocess`` has stored it.

        Args:
            x: chunked values with the shape (nr_of_samples, time_steps (chunks), feature_size).
            y: next-sentence class with the shape (nr_of_samples, 1).

        Returns:
            ``(x, y)`` with the noisy samples concatenated after the given ones along axis 0.
        """
        # Split every sample into its backcast and forecast chunks.
        x_backward = x[:, :self.backward_nr_of_chunks, :]
        x_forward = x[:, self.backward_nr_of_chunks:, :]

        # tf.random.shuffle permutes along the first axis, so forecast parts are
        # swapped between samples while the chunks inside a forecast keep their order.
        x_forward_noise = tf.random.shuffle(x_forward)

        # If the same forecast chunks still follow after shuffling (even though that is
        # a small percentage), the nsp class is set to 1; otherwise it is 0.
        y_forward_noise = tf.cast(
            tf.math.reduce_all(x_forward_noise == x_forward, axis = (1, 2)),
            dtype = tf.dtypes.float64
        )

        y_noise = tf.expand_dims(y_forward_noise, axis = 1)

        x_noise = tf.keras.layers.concatenate([x_backward, x_forward_noise], axis = 1)

        y = tf.keras.layers.concatenate([y, y_noise], axis = 0)
        x = tf.keras.layers.concatenate([x, x_noise], axis = 0)

        return x, y


    def shuffle(self, x_mlm, y_mlm, x_nsp, y_nsp):
        """Reorder the four inputs along axis 0 with one shared random permutation."""
        indices = tf.range(start = 0, limit = tf.shape(x_mlm)[0], dtype = tf.int32)

        idx = tf.random.shuffle(indices)

        # Gathering with the same index vector keeps the four outputs aligned.
        x_mlm = tf.gather(x_mlm, idx)
        y_mlm = tf.gather(y_mlm, idx)
        x_nsp = tf.gather(x_nsp, idx)
        y_nsp = tf.gather(y_nsp, idx)

        return x_mlm, y_mlm, x_nsp, y_nsp


    def preprocess(self, x, y):
        """Build the shuffled MLM/NSP inputs and targets.

        Args:
            x: lagged backcast values with the shape (nr_of_samples, time_steps).
            y: lagged forecast values with the shape (nr_of_samples, time_steps).

        Returns:
            ``(x_mlm, y_mlm, x_nsp, y_nsp)`` tensors. Their first axis is twice
            ``nr_of_samples`` because ``inject_noise`` appends one noisy copy per sample.
        """
        self.backward_time_steps = x.shape[1]
        self.forward_time_steps = y.shape[1]

        # Whole chunks only; a trailing partial chunk is not counted.
        self.backward_nr_of_chunks = self.backward_time_steps // self.size_of_chunk
        self.forward_nr_of_chunks = self.forward_time_steps // self.size_of_chunk

        # For the masked language model, x and y are concatenated along time.
        x_mlm = tf.keras.layers.concatenate(
            [x, y], axis = 1
        )

        x_mlm = self.InputBinDistributionByChunks(x_mlm)

        # The unmodified samples are the positive next-sentence class.
        y_nsp = tf.ones([x_mlm.shape[0], 1])

        x_mlm, y_nsp = self.inject_noise(x_mlm, y_nsp)

        x_mlm, y_mlm = self.mask(x_mlm)

        x_mlm = tf.convert_to_tensor(x_mlm)
        y_mlm = tf.convert_to_tensor(y_mlm)

        # One extra step per sample filled with 0.5, which acts as CLS.
        x_nsp = tf.ones([x_mlm.shape[0], 1, x_mlm.shape[2]]) * 0.5

        x_mlm, y_mlm, x_nsp, y_nsp = self.shuffle(x_mlm, y_mlm, x_nsp, y_nsp)

        return x_mlm, y_mlm, x_nsp, y_nsp


# Start from fresh copies so the *_SOURCE snapshots stay untouched.
X, Y = X_SOURCE.copy(), Y_SOURCE.copy()

# The limits of BIN_BOUNDARIES should be calculated dynamically. Otherwise the input
# dataset becomes too sparse, which may not be good for learning.
#   Idea #1: use the daily minimum and maximum.
#   Idea #2: keep it sparse, but apply FFT to get the differences.
#   Idea #3: keep several more BIN_BOUNDARIES variables.
# Idea #1 seems the most secure, but it needs a higher-scale logic to be built.
# The higher scale can be part of any X_TIME feature.
# In every case NR_OF_BOUNDARIES must stay constant.
dDisERTSettings = {
    'p_bin_boundaries': BIN_BOUNDARIES,
    'p_size_of_chunk': SIZE_OF_CHUNK,
}
oDisERT = DisERT(**dDisERTSettings)

aPreprocessed = oDisERT.preprocess(X, Y)
X_MLM, Y_MLM, X_NSP, Y_NSP = aPreprocessed

# SAVE

In [16]:
# Persist the four DisERT arrays under DISERT_DATA_PATH, creating the folder if needed.
os.makedirs(DISERT_DATA_PATH, exist_ok=True)

# os.path.join replaces the hand-written '\X' / '\Y' separators, which are invalid
# escape sequences inside a non-raw string literal.
for sFileName, aData in (
    ('X_MLM.npy', X_MLM),
    ('Y_MLM.npy', Y_MLM),
    ('X_NSP.npy', X_NSP),
    ('Y_NSP.npy', Y_NSP),
):
    np.save(os.path.join(DISERT_DATA_PATH, sFileName), aData)