In [1]:
import numpy as np
import pandas as pd
from ta import add_all_ta_features
from ta.utils import dropna
from pathlib import Path
import os
from sklearn.preprocessing import MinMaxScaler

In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases dropped the old names, so prefer the new name when it is available and
# fall back to the legacy name on older installations.
mpl.style.use('seaborn-v0_8' if 'seaborn-v0_8' in mpl.style.available else 'seaborn')

from pyts.image import GramianAngularField

In [3]:
def load_data(root=None, remove=None, headers=None):
    """
    Read a header-less price CSV, add the `ta` indicator columns and clean it.

    args:
        root: path of the CSV file. Defaults to the GBP/USD H1 2016-2018 file
            under ./OmegaDev/Model_Z/raw_data/.
        remove: indicator columns dropped after feature generation. Defaults to
            ['trend_psar_up', 'trend_psar_down'].
        headers: column names assigned to the CSV. Defaults to
            ['date', 'complete', 'open', 'high', 'low', 'close', 'volume'].

    returns:
        (df, data): `df` is the full frame after feature generation, column
        removal and NaN-row removal; `data` is a copy of `df` without its first
        two columns (selected by position).
    """
    if root is None:
        root = './OmegaDev/Model_Z/raw_data/GBP_USD_H1_2016-01-01_2018-01-01.csv'
    if remove is None:
        remove = ['trend_psar_up', 'trend_psar_down']
    if headers is None:
        headers = ['date', 'complete', 'open', 'high', 'low', 'close', 'volume']

    csv_path = Path(root)
    candles = dropna(pd.read_csv(csv_path, header=None, names=headers))
    with_features = add_all_ta_features(candles, 'open', 'high', 'low', 'close', 'volume')
    full_frame = with_features.drop(columns=remove, axis=1).dropna()

    # Positional slice: skips the first two columns of the cleaned frame.
    feature_frame = full_frame.iloc[:, 2:].copy()
    return full_frame, feature_frame

In [4]:
# Load the default CSV: `df` is the full indicator frame, `data` is the same frame
# without its first two columns.
df, data = load_data()

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


## Labeling
Performing the triple-barrier method for labeling.

In [5]:
data.head(3)

Unnamed: 0,open,high,low,close,volume,volume_adi,volume_obv,volume_cmf,volume_fi,momentum_mfi,...,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,momentum_kama,momentum_roc,others_dr,others_dlr,others_cr
51,1.46722,1.46802,1.4664,1.46668,1030,-2100.361678,1275,-0.165557,-0.129598,70.874071,...,44.879637,49.477352,56.736353,-50.522648,-0.001714,1.467623,-0.091279,-0.033398,-0.033403,-0.465546
52,1.46674,1.46722,1.46588,1.46588,685,-2785.361678,590,-0.216499,-0.18937,65.160717,...,39.18804,35.54007,47.67712,-64.45993,-0.001827,1.46761,-0.051819,-0.054545,-0.05456,-0.519837
53,1.46589,1.46682,1.46549,1.46619,512,-2758.41431,1102,-0.196579,-0.139643,59.277433,...,38.052418,40.940767,41.986063,-59.059233,-0.002015,1.467604,0.094894,0.021148,0.021145,-0.498799


## Scaling with `MinMaxScaler`

In [6]:
def scale_data(data, y_col=None, non_float_cols=None, scaler=None):
    """
    Scale each column of `data` independently and return the result as a new frame.

    args:
        data: DataFrame to scale. It is copied; the caller's frame is not modified.
        y_col: dependent var, if it lives in the master dataframe. Left unscaled.
        non_float_cols: a column name or a list of column names to leave unscaled,
            e.g. discrete vars / metadata that will not be used for the GASF or
            GADF conversion.
        scaler: object exposing fit(X) and transform(X) on an (n, 1) array, e.g.
            MinMaxScaler(feature_range=(a, b)) or RobustScaler(). Defaults to
            MinMaxScaler(feature_range=(0, 1)). The same instance is re-fitted
            for every column.

    returns:
        A copy of `data` in which every non-ignored column has been cast to
        float64 and replaced by its scaled values.
    """
    scaled = data.copy()
    extra = non_float_cols if isinstance(non_float_cols, list) else [non_float_cols]
    cols_to_ignore = [y_col] + extra
    scaler = MinMaxScaler(feature_range=(0, 1)) if scaler is None else scaler

    for c in list(scaled.columns):
        if c in cols_to_ignore:
            continue
        # Scalers expect a 2-D (n_samples, 1) array.
        column = scaled[c].astype('float64').values.reshape(-1, 1)
        scaler.fit(column)
        # Assign the column directly: the previous `data.iloc[:][c] = ...` was a
        # chained assignment that writes to a temporary object and is silently a
        # no-op under pandas copy-on-write.
        scaled[c] = np.asarray(scaler.transform(column)).reshape(-1)
    return scaled

In [7]:
# Scale every column with the default MinMaxScaler(feature_range=(0, 1)); no columns are excluded.
data_dd = scale_data(data)

In [8]:
data_dd.head()

Unnamed: 0,open,high,low,close,volume,volume_adi,volume_obv,volume_cmf,volume_fi,momentum_mfi,...,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,momentum_kama,momentum_roc,others_dr,others_dlr,others_cr
51,0.888419,0.886621,0.933135,0.885927,0.011638,0.588008,0.392564,0.355333,0.973108,0.727863,...,0.41536,0.494774,0.566134,0.494774,0.794071,0.950882,0.766721,0.719473,0.727505,0.885927
52,0.886823,0.883944,0.931526,0.883247,0.007736,0.585924,0.391663,0.308299,0.973064,0.667834,...,0.328004,0.355401,0.474361,0.355401,0.793184,0.950837,0.769888,0.716799,0.724878,0.883247
53,0.883995,0.882605,0.930318,0.884286,0.005779,0.586006,0.392336,0.326691,0.973101,0.60602,...,0.310574,0.409408,0.416709,0.409408,0.791716,0.950815,0.781661,0.72637,0.734277,0.884286
54,0.885192,0.883609,0.93168,0.884118,0.002975,0.585487,0.391989,0.337846,0.973114,0.671127,...,0.264585,0.525114,0.426954,0.525114,0.790924,0.950764,0.774867,0.723265,0.731228,0.884118
55,0.884627,0.880463,0.929978,0.88231,0.002975,0.585125,0.391641,0.385122,0.973112,0.743398,...,0.236755,0.272222,0.398867,0.272222,0.789822,0.950492,0.770762,0.719039,0.727079,0.88231


## Splitting Data
We will now begin to split the data into 'windows'. One important thing to note is that each window must be the same size as the window used to train the model - this is the sequence length (number of time steps) we will use for the RNN/Transformer architecture. 

We will take this window of size n and slide it one step at a time until the last position at which the window still fits. This builds the training data for a single column, c_col. 

We will do this for individual columns.

This data will then be used to create GADFs or GASFs which will be indexed in some manner through our Dataset object. 

In [164]:
"""
Example
    window_size: how many time steps
"""
# Sliding-window example on the scaled 'open' column.
window_size = 24
sample = data_dd['open']
n = len(data_dd)
last_idx = n - window_size  # final start offset at which a full window still fits

ddd = {'open': []}
for start_idx in range(last_idx + 1):
    end_idx = start_idx + window_size
    ddd['open'].append(sample.iloc[start_idx:end_idx].values)

In [23]:
def split_data(data_dd, y_col=None, cols_to_ignore=None, window_size=24):
    """
    Slice every selected column of `data_dd` into overlapping windows (stride 1).

    args:
        data_dd: DataFrame of (already scaled) features, one row per time step.
        y_col: dependent var column to leave out of the result, if present.
        cols_to_ignore: a column name or a list of column names to leave out.
        window_size: number of consecutive time steps per window. The default of
            24 corresponds to one day of hourly data.

    returns:
        ddd: <dict> mapping each kept column name to a list of 1-D arrays.
        Entry i holds rows [i, i + window_size) of that column, so each entry
        starts one time step after the previous one. Every list has
        len(data_dd) - window_size + 1 entries, or is empty when the frame is
        shorter than one window.

    raises:
        ValueError: if window_size is smaller than 1.
    """
    if window_size < 1:
        raise ValueError('window_size must be a positive integer')

    # Always include y_col in the ignore list, also when cols_to_ignore is a list.
    extra = cols_to_ignore if isinstance(cols_to_ignore, list) else [cols_to_ignore]
    c2i = [y_col] + extra
    # Use the frame passed in, not a notebook-level global.
    cols2ddd = [c for c in data_dd.columns if c not in c2i]

    n_windows = len(data_dd) - window_size + 1  # <= 0 yields no windows
    ddd = {}
    for c in cols2ddd:
        values = data_dd[c].values  # fetched once per column, not once per window
        ddd[c] = [values[start:start + window_size] for start in range(n_windows)]
    return ddd

In [24]:
# Window the scaled frame with the default window_size of 24.
ddd = split_data(data_dd)

In [26]:
"""
Stacking for right shape. The right shape per feature should be:
    shape: [n, n_features]
        n: number of samples (windows); with a stride of 1 this is the length of the master dataframe after dropna minus window_size, plus 1
        n_features: in this case this is the sequence size. Or the 'window_size' as it represents a sequence of n_features 
"""
# Turn each feature's list of windows into a single 2-D array, one row per window.
ddd = dict((feature, np.stack(windows)) for feature, windows in ddd.items())

In [29]:
def create_gasfd_defaults(size=None):
    """
    Build the pair of Gramian Angular Field transformers used in this notebook.

    args:
        size: image_size handed to both transformers; must be an int.
            Defaults to 24 when None.

    returns:
        (gasf, gadf): GramianAngularField instances configured with
        method='summation' and method='difference' respectively.
    """
    if size is None:
        size = 24
    assert isinstance(size, int), 'size should be an int'
    return tuple(
        GramianAngularField(image_size=size, method=method)
        for method in ('summation', 'difference')
    )

In [30]:
"""
GAXF, where x is either S or D. 

We will now convert each window of our windowed data into a GAXF format. This is what will be fed into the PyTorch model, STACKED across the number of features we want to use as independent vars.
"""
def gaxf_convert(ddd, c2='gasf', size=None):
    """
    Convert every feature's windowed data into Gramian Angular Field images.

    args:
        ddd: <dict> mapping feature name -> 2-D array of windows.
        c2: which field to compute, 'gasf' (summation) or 'gadf' (difference).
            Matching is case-insensitive.
        size: image size forwarded to create_gasfd_defaults.

    returns:
        <dict> with the same keys as `ddd`; each value is the output of
        fit_transform of the chosen transformer on that feature's windows.

    raises:
        ValueError: if `c2` is neither 'gasf' nor 'gadf'.
    """
    # Normalise once so validation and dispatch agree; previously 'GASF' passed
    # validation but fell through to the GADF branch.
    method = c2.lower() if isinstance(c2, str) else c2
    if method not in ('gasf', 'gadf'):
        # Fail loudly instead of returning None, which only surfaced later as an
        # unrelated error in the caller.
        raise ValueError(f"c2 must be 'gasf' or 'gadf', got {c2!r}")
    gasf, gadf = create_gasfd_defaults(size=size)
    transformer = gasf if method == 'gasf' else gadf
    return {k: transformer.fit_transform(v) for k, v in ddd.items()}

In [115]:
# Convert every feature's windows with the defaults (c2='gasf', size=None).
ddd_GASF = gaxf_convert(ddd)

In [34]:
def plot_gsxf_ts(gsxf_data_conv, i):
    """
    Draw one GAXF matrix as a heat map in a new 5x5 figure and show it.

    args:
        gsxf_data_conv: the converted data for a single timestep, passed to imshow.
        i: index of the timestep, used only in the figure title.
    """
    heading = f'gaxf: {i}'
    plt.figure(figsize=(5, 5))
    plt.imshow(gsxf_data_conv, origin='lower', cmap='rainbow')
    plt.title(heading)
    plt.colorbar(pad=0.04, fraction=0.0457)
    plt.tight_layout()
    plt.show()
    
def plot_ts(ddd_GAXF, col='open', n=5):
    """
    Plot the first `n` GAXF images of feature `col`, one figure per image.

    args:
        ddd_GAXF: <dict> mapping feature name -> indexable collection of images.
        col: feature to plot.
        n: how many leading timesteps to plot.
    """
    idx = 0
    while idx < n:
        plot_gsxf_ts(ddd_GAXF[col][idx], idx)
        idx += 1

## Feature Stacking
Now we need to stack the features of each timestep (a single 'timestep' here is itself a window of consecutive time steps, which is an unusual concept). For every timestep we stack the images of all n_features. Think of this as an image with n_features=n_channels. 

The reason we are going with this approach is because we can then utilize ConvNets

In [82]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [116]:
def feature_stack(ddd_GAXF, y_col=None, cols_to_ignore=None):
    """
    Given ddd_GAXF, where X is either S or D, build the 'stacked_features' array in
    which the images of all kept features for one timestep are stacked along a
    channel axis -> similar to an image.

    args:
        ddd_GAXF: <dict> mapping feature name -> array of shape (n, h, w).
        y_col: feature to leave out of the stack, if present.
        cols_to_ignore: a feature name or a list of feature names to leave out.

    returns:
        Array of shape (n, features, h, w), where
            n: the number of timesteps
            features: the kept features, in the dict's key order
            h: height
            w: width
        This is very similar to an image with c=features channels.

    raises:
        ValueError: if no feature is left after applying y_col/cols_to_ignore.
    """
    # Always include y_col in the ignore list, also when cols_to_ignore is a list.
    extra = cols_to_ignore if isinstance(cols_to_ignore, list) else [cols_to_ignore]
    c2i = [y_col] + extra
    # Select from the dict that was passed in (not a notebook-level global) and
    # actually apply the selection when stacking.
    cols2stack = [c for c in ddd_GAXF.keys() if c not in c2i]
    if not cols2stack:
        raise ValueError('no features left to stack after applying y_col/cols_to_ignore')
    # Stacking on axis 1 inserts the channel axis right after the sample axis.
    return np.stack([ddd_GAXF[c] for c in cols2stack], axis=1)

In [117]:
# Stack the per-feature GASF images into one array; no features are excluded here.
ddd_stacked = feature_stack(ddd_GASF)

In [118]:
ddd_stacked.shape

(12334, 75, 24, 24)