# R-Regression Dataset
In this notebook we take much of what we explored in our sandbox notebooks and use it to create a dataset tuned for regression. We will use our stationary close to create our labels, which will range from -1 to 1. 

The models that follow for each task will take in the same data, the only difference being the dataset creation. 

In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use('seaborn')
import pandas as pd
import numpy as np
import os
from pathlib import Path

## Labeling

In [2]:
def load_data(root=None, remove=None, headers=None):
    """
    Read a header-less candle CSV into a dataframe.

    args:
        root: path of the CSV to read. When None, the project's default GBP_USD H1 file is used.
        remove: accepted for interface compatibility; not used by this function.
        headers: column names to assign to the CSV columns. When None, the default
                 date/complete/open/high/low/close/volume names are used.

    returns:
        <df> dataframe with one row per CSV line and the given column names.
    """
    if root is None:
        csv_path = Path('./OmegaDev/Model_Z/raw_data/GBP_USD_H1_2016-01-01_2018-01-01.csv')
    else:
        csv_path = Path(root)

    if headers is None:
        headers = ['date', 'complete', 'open', 'high', 'low', 'close', 'volume']

    # the file carries no header row, so the names are supplied explicitly
    return pd.read_csv(csv_path, header=None, names=headers)

In [3]:
def stationary_close(df, col='close', diff=1):
    """
    Differenced and tanh-squashed version of a price column.

    args:
        df: dataframe holding the column.
        col: name of the column to difference.
        diff: lag, in rows, of the difference (1 by default, i.e. each row minus the previous row).
              The lag should match the time-window that is used to label data.

    returns:
        series of tanh(value[t] - value[t - diff]); the first `diff` entries are NaN.
    """
    delta = df[col].diff(periods=diff)
    return np.tanh(delta)

In [4]:
"""
Window will be the timeframe we will be looking at for labeling data. That we will be taking 4 timesteps to label the 5th timestep. 

This will later be used when stacking our images as each 'image' will represent a WINDOW sequence length
"""
WINDOW = 24

In [5]:
# Load the raw candles from load_data's default CSV path.
df = load_data()
# Keep a separate copy of the close price; it is the column passed as y_col to the
# scaling and splitting steps, so it stays unscaled and is used only for labelling.
df['close_label'] = df['close']
# Disabled: tanh-squashed WINDOW-step close difference (see stationary_close).
# df['stationary_close'] = stationary_close(df, 'close', WINDOW)

## Technicals + Scaling

In [6]:
from ta import add_all_ta_features
from ta.utils import dropna
from sklearn.preprocessing import MinMaxScaler

In [7]:
# Append the `ta` library's indicator columns (volume_*, volatility_*, trend_*, momentum_*),
# telling it which of our columns hold open/high/low/close/volume.
df = add_all_ta_features(df, 'open', 'high', 'low', 'close', 'volume')

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [9]:
df.columns

Index(['date', 'complete', 'open', 'high', 'low', 'close', 'volume',
       'close_label', 'volume_adi', 'volume_obv', 'volume_cmf', 'volume_fi',
       'momentum_mfi', 'volume_em', 'volume_sma_em', 'volume_vpt',
       'volume_nvi', 'volume_vwap', 'volatility_atr', 'volatility_bbm',
       'volatility_bbh', 'volatility_bbl', 'volatility_bbw', 'volatility_bbp',
       'volatility_bbhi', 'volatility_bbli', 'volatility_kcc',
       'volatility_kch', 'volatility_kcl', 'volatility_kcw', 'volatility_kcp',
       'volatility_kchi', 'volatility_kcli', 'volatility_dcl',
       'volatility_dch', 'trend_macd', 'trend_macd_signal', 'trend_macd_diff',
       'trend_sma_fast', 'trend_sma_slow', 'trend_ema_fast', 'trend_ema_slow',
       'trend_adx', 'trend_adx_pos', 'trend_adx_neg', 'trend_vortex_ind_pos',
       'trend_vortex_ind_neg', 'trend_vortex_ind_diff', 'trend_trix',
       'trend_mass_index', 'trend_cci', 'trend_dpo', 'trend_kst',
       'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_c

In [9]:
def scale_data(data, y_col=None, non_float_cols=None, scaler=None):
    """
    Return a copy of `data` in which every column, except the ignored ones, is cast to float64 and
    rescaled independently of the other columns.

    args:
        data: <df> dataframe to scale. It is copied first, so the caller's frame is left untouched.
        y_col: dependent var if y_col is in the master dataframe; it is never scaled.
        non_float_cols: a column name or a list of column names to leave unscaled - e.g. discrete vars /
                        metadata that will not be used for GASF or GADF conversion.
        scaler: scaler to use, e.g. MinMaxScaler(feature_range=(a, b)), RobustScaler(), etc. Any object whose
                fit(X) returns the fitted scaler and which offers transform(X) works. Defaults to
                MinMaxScaler(feature_range=(0, 1)). The same object is re-fit on each column in turn.

    returns:
        <df> the scaled copy of `data`.
    """
    data = data.copy()
    extra_ignored = non_float_cols if isinstance(non_float_cols, list) else [non_float_cols]
    cols_to_ignore = [y_col] + extra_ignored
    scaler = MinMaxScaler(feature_range=(0, 1)) if scaler is None else scaler
    cols_to_scale = [c for c in list(data.columns) if c not in cols_to_ignore]
    for c in cols_to_scale:
        # scalers work on 2-D input, so each column is presented as an (n_rows, 1) array
        column = data[c].astype('float64').values[:, None]
        fitted = scaler.fit(column)
        # Assign through data[c]: the previous `data.iloc[:][c] = ...` was a chained assignment that
        # writes to a temporary object, so on current pandas the scaled values never reached `data`.
        data[c] = np.asarray(fitted.transform(column)).reshape(-1)
    return data

In [10]:
"""
removing technicals we dont want
"""
remove = ['trend_psar_up', 'trend_psar_down']
df.drop(columns=remove,axis=1,inplace=True)

In [11]:
"""
Dropping na
For now we will use this dummy method. However, it would be best to fill in na with some values. 
"""
df.dropna(inplace=True)

In [12]:
# grabbing all data
data = df.iloc[:,2:].copy()
data_dd = scale_data(data, y_col='close_label')

## Data split

In [13]:
def vol_label(start_idx, df, y_col, window=4, sl_pips=20, tp_pips=40):
    """
    Label a sequence by which price barrier is reached first inside the label window.

    The label window is rows [start_idx, start_idx + window) of df[y_col]. The value of its first row is the
    reference price; the take-profit barrier sits tp_pips * 0.0001 above it and the stop-loss barrier
    sl_pips * 0.0001 below it.

    args:
    -----------------
        start_idx: <int> position of the first row of the label window. When the feature sequence covers
                   rows [0,1,2,3,4], start_idx should be 5.

        df:        <df> dataframe holding the price column.

        y_col:     name of the price column used for labelling.

        window:    <int> number of rows in the label window.

        sl_pips & tp_pips: stop loss pips and take profit pips

    returns:
    -----------------
        1 when the take-profit barrier is reached first, 2 when the stop-loss barrier is reached first,
        0 when neither is reached inside the window.

    raises:
    -----------------
        IndexError when the label window is empty (start_idx is past the end of df).
    """
    pip = 0.0001
    horizon = df.iloc[start_idx:start_idx + window][y_col]
    reference = horizon.iloc[0]
    take_profit = reference + (pip * tp_pips)
    stop_loss = reference - (pip * sl_pips)

    # walk forward in time and stop at the first barrier that is touched
    for price in horizon.values:
        if price >= take_profit:
            return 1
        if price <= stop_loss:
            return 2
    return 0

In [14]:
def split_data_x(data_dd, y_col=None, cols_to_ignore=None, window_size=24, lbl_window=4, sl_pips=20, tp_pips=40):
    """
    Slide a window of `window_size` rows over data_dd (scaled data from some manner), one row at a time, and
    collect for every position the windowed values of each feature together with the label of the rows that
    follow the window. By default the window is 24 time steps -> in this case a single day of data.

    args:
        data_dd: <df> source dataframe.
        y_col: column handed to vol_label for labelling; it is never returned as a feature.
        cols_to_ignore: a column name or a list of column names to leave out of the features.
        window_size: <int> number of rows in each feature window.
        lbl_window, sl_pips, tp_pips: forwarded to vol_label.

    returns:
        ddd: <dict> one key per feature column plus 'y'. ddd[col][i] holds the values of rows
             [i, i + window_size) of that column, so each entry is a single timestep ahead of the previous one.
             ddd['y'][i] is vol_label evaluated at row i + window_size, i.e. on the rows right after window i.
             There are len(data_dd) - window_size entries per key (none when that is not positive).
    """
    # y_col is always excluded, whether cols_to_ignore is a list, a single name or None. Previously a list
    # replaced the whole ignore-set, which let the label column through as a feature.
    extra_ignored = cols_to_ignore if isinstance(cols_to_ignore, list) else [cols_to_ignore]
    c2i = [y_col] + extra_ignored
    cols2ddd = [c for c in list(data_dd.columns) if c not in c2i]

    # pull every feature column out once instead of re-indexing the dataframe at each step
    columns = {c: data_dd[c].values for c in cols2ddd}
    ddd = {c: [] for c in cols2ddd}
    ddd['y'] = []

    last_idx = len(data_dd) - window_size
    for start_idx in range(last_idx):
        end_idx = start_idx + window_size

        # grab features for time-series features
        for c in cols2ddd:
            ddd[c].append(columns[c][start_idx:end_idx])

        # grab labels -> the label window starts right after the feature window
        y = vol_label(end_idx, data_dd, y_col, window=lbl_window, sl_pips=sl_pips, tp_pips=tp_pips)
        ddd['y'].append(y)

    return ddd

In [15]:
def split_data(data_dd, y_col, window_size=24, lbl_window=4, sl_pips=20, tp_pips=40):
    """
    Wraps split_data_x and returns its dictionary of windowed feature values along with the 'y' labels
    associated with each sequence.

    args:
        data_dd: <df> source dataframe.
        y_col: column used for labelling.
        window_size: <int> number of rows in each feature window.
        lbl_window: <int> number of rows inspected when labelling a window.
        sl_pips & tp_pips: stop loss pips and take profit pips used for labelling.

    The keyword defaults mirror split_data_x, so callers may omit any of them (sl_pips / tp_pips used to be
    required, which made calls that only pass the window sizes fail with a TypeError).
    """
    ddd = split_data_x(data_dd, y_col=y_col, window_size=window_size, lbl_window=lbl_window, sl_pips=sl_pips, tp_pips=tp_pips)
    return ddd

In [16]:
# def split_data_x(data_dd, y_col=None, cols_to_ignore=None, window_size=24):
#     """
#     given data_dd (scaled data from some manner), we will return a dictionary of each feature in sequence format. By default the window will be 24 which represents 24 time steps -> in this case a single day of data. 
    
#     returns: 
#         ddd: <dict> representing each feature with sequences of data. [0] will contain [n0, nM] where M is the timestep size or 'window_size'. [1] will be [[0]n0+1, [0]nM+1]. therefore, each idx in the array will be a single timestep ahead of the previous one. 
#     """
#     c2i = [y_col] + [cols_to_ignore] if not isinstance(cols_to_ignore, list) else cols_to_ignore
#     cols2ddd = [c for c in list(data.columns) if c not in c2i]
#     ddd = {c:[] for c in cols2ddd}

#     n = len(data_dd)

#     start_idx=0
#     end_idx=window_size
#     last_idx=n-window_size

#     while start_idx<last_idx:
#         for c in cols2ddd:
#             sample = data_dd[c]
#             win = sample.iloc[start_idx:end_idx].values
#             ddd[c].append(win)
#         start_idx+=1
#         end_idx+=1
        
#     return ddd

In [17]:
# def split_data_y(data_dd, y_col, window_size=4):
#     """
#     This is the same as the other split func. However, this should return the Y values depending on the window size. The window size should be the same
#     """
#     n = len(data_dd)
#     start_idx=0
#     end_idx=window_size
#     last_idx=n-window_size
#     y_values = []
    
#     while start_idx<last_idx:
#         y = data_dd.iloc[end_idx][y_col]
#         y_values.append(y)
#         start_idx+=1
#         end_idx+=1
        
#     return y_values

In [18]:
# def split_data(data_dd, y_col, window_size):
#     """
#     wrappes around split_x, split_y and returns dictionary with windowed values along with y_values associated with each sequence
#     """
#     ddd = split_data_x(data_dd, y_col=y_col, window_size=window_size)
#     y = split_data_y(data_dd, y_col=y_col, window_size=window_size)
#     ddd['y'] = y
#     return ddd

In [19]:
### grabbing timesteps in dictionary format
# number of future timesteps inspected when labelling each sequence
lbl_window=4
# sl_pips / tp_pips are passed explicitly (same values as split_data_x's defaults) so the call supplies
# every argument of split_data.
ddd = split_data(data_dd, y_col='close_label', window_size=WINDOW, lbl_window=lbl_window, sl_pips=20, tp_pips=40)

In [22]:
from collections import Counter

In [28]:
Counter(np.array(ddd['y']))

Counter({0: 4237, -1: 4058, 1: 4038})

In [29]:
def stack(ddd):
    """
    Turn every list in the split dictionary into a single numpy array. This is necessary for GAXF formation
    and for data iteration when passing through the dataset class.

    shape:
    ----------
    features -> [n, n_features]
        n: number of samples (one per window produced by the split)
        n_features: the sequence size, i.e. the 'window_size' of one feature (close, open, etc)
    'y' -> [n, 1]
        the stacked labels with an extra trailing axis added

    returns a new dictionary; the input dictionary is not modified.
    """
    stacked = {}
    for name, windows in ddd.items():
        stacked[name] = np.stack(windows)
    # labels become a column vector
    stacked['y'] = stacked['y'][:, None]
    return stacked

In [30]:
# stacking our features along with our y
# NOTE(review): this rebinds `ddd`, so the cell is not idempotent - running it a second time applies
# stack() to the already-stacked arrays and adds another axis to ddd['y'].
ddd = stack(ddd)

## GAXF Conversion

In [31]:
from pyts.image import GramianAngularField

In [32]:
def create_gasfd_defaults(size=None):
    """
    Build the pair of Gramian angular field transformers used in this notebook.

    args:
        size: <int> image size handed to both transformers; 24 when None.

    returns:
        (gasf, gadf): the 'summation' transformer followed by the 'difference' transformer.
    """
    if size is None:
        size = 24
    assert isinstance(size, int), 'size should be an int'
    return tuple(
        GramianAngularField(image_size=size, method=method)
        for method in ('summation', 'difference')
    )

In [33]:
def gaxf_convert(ddd, c2='gasf', size=None):
    """
    GAXF, where X is either S or D.

    Converts the windowed data of every feature into its GAXF representation. This is what will be fed into
    the PyTorch model STACKED for the number of features we want to use as independent vars.

    args:
        ddd: <dict> feature name -> stacked windows, plus a 'y' entry. # assumes each feature value is the
             2-D [n, window_size] array produced by stack() - TODO confirm
        c2: 'gasf' or 'gadf', case-insensitive.
        size: <int> image size forwarded to create_gasfd_defaults.

    returns:
        <dict> with the same keys as ddd: every feature replaced by the output of the chosen transformer's
        fit_transform, and 'y' passed through unchanged. Returns None when c2 names neither conversion.
    """
    # Normalise once and use the normalised value for BOTH validation and dispatch: previously an upper-case
    # 'GASF' passed the check but then fell through to the GADF branch.
    kind = c2.lower()
    if kind not in ['gasf', 'gadf']: return
    gasf, gadf = create_gasfd_defaults(size=size)
    transformer = gasf if kind == 'gasf' else gadf
    temp = {k: transformer.fit_transform(v) for k, v in ddd.items() if k != 'y'}
    temp['y'] = ddd['y']
    return temp

In [34]:
"""
Creating GASF from our timesteps. The image size will be the window size.
"""
ddd_GASF = gaxf_convert(ddd, c2='gasf', size=WINDOW)

In [35]:
ddd_GASF['open'].shape

(12333, 24, 24)

In [36]:
for k in ddd_GASF.keys():
    print(k, ddd_GASF[k].shape)

open (12333, 24, 24)
high (12333, 24, 24)
low (12333, 24, 24)
close (12333, 24, 24)
volume (12333, 24, 24)
volume_adi (12333, 24, 24)
volume_obv (12333, 24, 24)
volume_cmf (12333, 24, 24)
volume_fi (12333, 24, 24)
momentum_mfi (12333, 24, 24)
volume_em (12333, 24, 24)
volume_sma_em (12333, 24, 24)
volume_vpt (12333, 24, 24)
volume_nvi (12333, 24, 24)
volume_vwap (12333, 24, 24)
volatility_atr (12333, 24, 24)
volatility_bbm (12333, 24, 24)
volatility_bbh (12333, 24, 24)
volatility_bbl (12333, 24, 24)
volatility_bbw (12333, 24, 24)
volatility_bbp (12333, 24, 24)
volatility_bbhi (12333, 24, 24)
volatility_bbli (12333, 24, 24)
volatility_kcc (12333, 24, 24)
volatility_kch (12333, 24, 24)
volatility_kcl (12333, 24, 24)
volatility_kcw (12333, 24, 24)
volatility_kcp (12333, 24, 24)
volatility_kchi (12333, 24, 24)
volatility_kcli (12333, 24, 24)
volatility_dcl (12333, 24, 24)
volatility_dch (12333, 24, 24)
trend_macd (12333, 24, 24)
trend_macd_signal (12333, 24, 24)
trend_macd_diff (12333, 24,