# Feature Engineering Development

This notebook is used for the development of new features and transformers. Once development on a transformer is complete it will be saved as a new file in `src/prep/`

## Target variable creation

Currently, the dataset has no suitable target variables, so they will have to be created. There a multiple directions this could take, however for v1 it the target will be binary, with the following rules:

if: Open > Close then 1
else: 0

This column will then need to be shifted up the dataset one row.

Since the datasets have less than 50k rows computing this in numpy is more efficient.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np

# Importing ibm_historcial.csv to serve as test for development
ibm = pd.read_csv('../data/ibm_historical.csv', delimiter=',')

In [2]:
ibm[:5]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2000-01-03 00:00:00-05:00,62.830047,64.82077,62.515725,64.82077,10823694,0.0,0.0
1,2000-01-04 00:00:00-05:00,63.703158,63.982558,61.956909,62.620483,8606279,0.0,0.0
2,2000-01-05 00:00:00-05:00,63.109447,66.916272,62.655425,64.82077,13318927,0.0,0.0
3,2000-01-06 00:00:00-05:00,65.938384,66.462258,63.423786,63.703186,8338607,0.0,0.0
4,2000-01-07 00:00:00-05:00,65.51926,65.903434,61.817214,63.423763,12402108,0.0,0.0


In [3]:
# Testing inequality
(ibm['Open'] > ibm['Close']).astype(float)[:5]

0    0.0
1    1.0
2    0.0
3    1.0
4    1.0
dtype: float64

In [4]:
ibm['target'] = (ibm['Open'] < ibm['Close']).astype(float).shift(-1)

ibm['target'][:5]

0    0.0
1    1.0
2    0.0
3    0.0
4    1.0
Name: target, dtype: float64

### Create Function

In [5]:
def calculateTarget(df):
    df['target'] = (df['Open'] < df['Close']).astype(float).shift(-1)
    return df

In [6]:
# Test function operates as intended
ibm = pd.read_csv('../data/ibm_historical.csv')

calculateTarget(ibm)

ibm[['Open', 'Close', 'target']][:5], ibm[['Open', 'Close', 'target']][-5:]

(        Open      Close  target
 0  62.830047  64.820770     0.0
 1  63.703158  62.620483     1.0
 2  63.109447  64.820770     0.0
 3  65.938384  63.703186     0.0
 4  65.519260  63.423763     1.0,
             Open       Close  target
 5721  121.650002  122.760002     0.0
 5722  121.849998  121.629997     0.0
 5723  121.660004  118.809998     1.0
 5724  120.160004  121.510002     1.0
 5725  122.800003  125.500000     NaN)

### Package into python file

In [7]:
%%writefile ../src/dataPreparation/createTarget.py
import pandas as pd


def calculateTarget(df):
    """
    Calculates target variable from a pandas dataframe, does this in an inplace fashion.
    Dataframe MUST contain columns 'Open' and 'Close'.
    :param df:
    """
    df['target'] = (df['Open'] < df['Close']).astype(float).shift(-1)
    return df

Overwriting ../src/dataPreparation/createTarget.py


## Non Temporal CNN data preparation

This approach involves consoladating a specific window of data from the dataset into a matrix, which is exactly the same format an image would be propergated through a neural network.

This will require function to be created that 'packages' the data into the matrices.


In [1]:
# Import required libraries
import pandas as pd
import tensorflow as tf
import numpy as np
import src.dataPreparation as prep

# Import data for development
msft = pd.read_csv('../data/msft_historical.csv', delimiter=',')

# Remove Date from dataframe since it is not required anymore
msft.drop(columns=['Date'], inplace=True)

# Calculate target variable
msft = prep.calculateTarget(msft)

# Separate x and y
X, y = msft.drop(columns='target'), msft.target

In [2]:
# Convert pandas dataframe to tensorflow tensor to improve computational speed
X_tensor = tf.convert_to_tensor(X)
y_tensor = tf.convert_to_tensor(y)

Metal device set to: Apple M1 Pro


2022-10-10 21:00:48.006472: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-10 21:00:48.006595: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Creating the first packet of data for window size of 5
first_packet_x = X_tensor[:5]
first_packet_y = y_tensor[5]

In [4]:
first_packet_x, first_packet_y

(<tf.Tensor: shape=(5, 7), dtype=float64, numpy=
 array([[3.68101952e+01, 3.72022101e+01, 3.51245314e+01, 3.65553856e+01,
         5.32284000e+07, 0.00000000e+00, 0.00000000e+00],
        [3.56145333e+01, 3.67317751e+01, 3.52029179e+01, 3.53205223e+01,
         5.41190000e+07, 0.00000000e+00, 0.00000000e+00],
        [3.48501156e+01, 3.64965777e+01, 3.43012949e+01, 3.56929474e+01,
         6.40596000e+07, 0.00000000e+00, 0.00000000e+00],
        [3.51833250e+01, 3.57125450e+01, 3.39876800e+01, 3.44972992e+01,
         5.49766000e+07, 0.00000000e+00, 0.00000000e+00],
        [3.40660831e+01, 3.52029259e+01, 3.36544676e+01, 3.49481163e+01,
         6.20136000e+07, 0.00000000e+00, 0.00000000e+00]])>,
 <tf.Tensor: shape=(), dtype=float64, numpy=0.0>)

In [7]:
# Check tensor shapes
X_tensor.shape, y_tensor.shape

(TensorShape([5726, 7]), TensorShape([5726]))

In [26]:
# Create list for tensors
X_tensor_stack = []

# Create loop logic to create packets for a window size of 5
for i, _ in enumerate(X_tensor[:-1]):
    i += 5

    packet = X_tensor[:-1][i-5:i]
    X_tensor_stack.append(packet)

# Turn stack into one tensor
X_tensor_reshape = tf.stack(X_tensor_stack[:-4])

# Find target values
y_tensor_reshape = y_tensor[5:]

X_tensor_reshape.shape, y_tensor_reshape.shape

(TensorShape([5721, 5, 7]), TensorShape([5721]))

### Create function

In [None]:
def nonTemporalTransform(X, y, window_size):
    # Convert to tensor format
    X_tensor = tf.convert_to_tensor(X)
    y_tensor = tf.convert_to_tensor(y)

    # Create list for stack
    X_tensor_stack = []

    # Package loop
    for i, _ in enumerate(X_tensor):
        i += window_size

        packet = X_tensor[:-1][i-window_size:i]
        X_tensor_stack.append(packet)

    # Convert stack into one tensor
    X_reshape = tf.stack(X_tensor_stack[:-(window_size-1)])

    # Find target values
    y_reshape = y_tensor[window_size:]

    return X_reshape, y_reshape

### Package into python file

In [35]:
%%writefile ../src/dataPreparation/nonTemporalWindowFunction.py
# Import requried libraries
import pandas as pd
import tensorflow as tf
import numpy as np

def nonTemporalTransform(X, y, window_size):
    """

    :param X: X values in pandas dataframe format
    :param y: y values in pandas dataframe format
    :param window_size: size of prediction window
    :return:
    """
    # Convert to tensor format
    X_tensor = tf.convert_to_tensor(X)
    y_tensor = tf.convert_to_tensor(y)

    # Create list for stack
    X_tensor_stack = []

    # Package loop
    for i, _ in enumerate(X_tensor):
        i += window_size

        packet = X_tensor[:-1][i-window_size:i]
        X_tensor_stack.append(packet)

    # Convert stack into one tensor
    X_reshape = tf.stack(X_tensor_stack[:len(X_tensor_stack)-window_size])

    # Find target values
    y_reshape = y_tensor[window_size:]

    return X_reshape, y_reshape


Overwriting ../src/dataPreparation/nonTemporalWindowFunction.py


## Tensor normalizer

In [1]:
# Import required libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import src.dataPreparation as prep

# Import nvda_historical.csv for dev
nvda = pd.read_csv('../data/nvda_historical.csv')

# Create fake y_tensor
fake_y = tf.random.uniform(shape=(1, len(nvda)))

# Transform data into windowed format
window = 5

nvda, fake_y = prep.nonTemporalTransform(
    nvda.drop(columns=['Date']),
    fake_y,
    window_size=window
)


Metal device set to: Apple M1 Pro


2022-11-01 13:00:01.695073: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-01 13:00:01.695620: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
# Test subseting data correctly
nvda[:, :, :4], nvda[:, :, 4]  # Candlestick features and Volume respectively

(<tf.Tensor: shape=(5722, 5, 4), dtype=float64, numpy=
 array([[[  0.90364332,   0.91081553,   0.8438787 ,   0.89527589],
         [  0.87973735,   0.88212867,   0.82714492,   0.87137079],
         [  0.8462694 ,   0.86061295,   0.83073063,   0.84268373],
         [  0.84268362,   0.84268362,   0.75542712,   0.78769988],
         [  0.78411425,   0.8092148 ,   0.77216115,   0.80084825]],
 
        [[  0.87973735,   0.88212867,   0.82714492,   0.87137079],
         [  0.8462694 ,   0.86061295,   0.83073063,   0.84268373],
         [  0.84268362,   0.84268362,   0.75542712,   0.78769988],
         [  0.78411425,   0.8092148 ,   0.77216115,   0.80084825],
         [  0.80323871,   0.86061291,   0.78889517,   0.82714492]],
 
        [[  0.8462694 ,   0.86061295,   0.83073063,   0.84268373],
         [  0.84268362,   0.84268362,   0.75542712,   0.78769988],
         [  0.78411425,   0.8092148 ,   0.77216115,   0.80084825],
         [  0.80323871,   0.86061291,   0.78889517,   0.82714492],
 

In [3]:
# Testing Normalizing candlestick features
tf.linalg.normalize(nvda[:, :, :4], axis=2)[0]

<tf.Tensor: shape=(5722, 5, 4), dtype=float64, numpy=
array([[[0.50835466, 0.51238947, 0.4747334 , 0.50364747],
        [0.50829418, 0.50967584, 0.47790736, 0.50346015],
        [0.50066736, 0.50915324, 0.49147436, 0.49854601],
        [0.52147096, 0.52147096, 0.46747474, 0.48744583],
        [0.49519943, 0.51105143, 0.48765057, 0.50576762]],

       [[0.50829418, 0.50967584, 0.47790736, 0.50346015],
        [0.50066736, 0.50915324, 0.49147436, 0.49854601],
        [0.52147096, 0.52147096, 0.46747474, 0.48744583],
        [0.49519943, 0.51105143, 0.48765057, 0.50576762],
        [0.48952757, 0.52449382, 0.48078601, 0.50409702]],

       [[0.50066736, 0.50915324, 0.49147436, 0.49854601],
        [0.52147096, 0.52147096, 0.46747474, 0.48744583],
        [0.49519943, 0.51105143, 0.48765057, 0.50576762],
        [0.48952757, 0.52449382, 0.48078601, 0.50409702],
        [0.50726204, 0.51316062, 0.48956685, 0.48956685]],

       ...,

       [[0.49872759, 0.50639725, 0.49218231, 0.50258252],

In [4]:
# Testing Normalizing Volume feature
tf.linalg.normalize(nvda[:, :, 4], axis=1)[0]

<tf.Tensor: shape=(5722, 5), dtype=float64, numpy=
array([[0.61955496, 0.6186655 , 0.38780246, 0.24805917, 0.14656245],
       [0.66711066, 0.41816968, 0.26748367, 0.15803915, 0.53251629],
       [0.51352033, 0.32847503, 0.19407509, 0.65394013, 0.40385416],
       ...,
       [0.51960263, 0.42971241, 0.43482412, 0.42584272, 0.41826583],
       [0.44638841, 0.45169849, 0.44236854, 0.43449762, 0.46068183],
       [0.45167629, 0.4423468 , 0.43447626, 0.46065919, 0.44647657]])>

In [5]:
# Full logic

# Subset and create normalized tensors
norm_candlestick = tf.linalg.normalize(nvda[:, :, 4], axis=2)[0]
norm_volume = tf.linalg.normalize(nvda[:, :, 4])[0]

# Stitch back together in correct shape w/ stock splits and dividends

2022-11-01 13:00:08.024798: W tensorflow/core/framework/op_kernel.cc:1780] OP_REQUIRES failed at reduction_ops_common.h:147 : INVALID_ARGUMENT: Invalid reduction dimension (2 for input with 2 dimension(s)


InvalidArgumentError: {{function_node __wrapped__Sum_device_/job:localhost/replica:0/task:0/device:CPU:0}} Invalid reduction dimension (2 for input with 2 dimension(s) [Op:Sum]

## Shuffling tensors of rank 3

For the non-temporal approach learning can be improved by creating a non-sequential dataset, however how this has been structured means that I need to shuffle two tensors in the same way.

In [8]:
import tensorflow as tf
import numpy as np

In [10]:
x = tf.convert_to_tensor(np.arange(5))
y = tf.convert_to_tensor(['a', 'b', 'c', 'd', 'e'])

indices = tf.range(start=0, limit=tf.shape(x)[0])
shuffled_indices = tf.random.shuffle(indices)

shuffled_x = tf.gather(x, shuffled_indices),
shuffled_y = tf.gather(y, shuffled_indices)

print(x, y)
print(shuffled_x, shuffled_y)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64) tf.Tensor([b'a' b'b' b'c' b'd' b'e'], shape=(5,), dtype=string)
(<tf.Tensor: shape=(5,), dtype=int64, numpy=array([3, 4, 1, 0, 2])>,) tf.Tensor([b'd' b'e' b'b' b'a' b'c'], shape=(5,), dtype=string)


### Convert to function and test

In [14]:
def shuffle_tensors(X, y, seed=None):
    assert tf.shape(X)[0] == tf.shape(y)[0], 'X and y MUST be the same length'
    shuffled_indices = tf.random.shuffle(
        tf.range(start=0, limit=tf.shape(X)[0]),
        seed=seed
    )
    shuffled_x = tf.gather(X, shuffled_indices, axis=0)
    shuffled_y = tf.gather(y, shuffled_indices, axis=0)
    return shuffled_x, shuffled_y

In [15]:
test_1 = tf.Variable(
    [
        [1, 2],
        [3, 4],
        [5, 6]
    ])

test_2 = tf.Variable(
    [
        [11, 12],
        [13, 14],
        [15, 16]
    ]
)

shuffle_tensors(test_1, test_2)

(<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
 array([[5, 6],
        [3, 4],
        [1, 2]], dtype=int32)>,
 <tf.Tensor: shape=(3, 2), dtype=int32, numpy=
 array([[15, 16],
        [13, 14],
        [11, 12]], dtype=int32)>)

In [17]:
%%writefile ../src/dataPreparation/shuffleTensors.py
# Import required libraries
import tensorflow as tf


def shuffle_tensors(X, y, seed=None):
    '''

    :param X:
    :param y:
    :param seed:
    :return:
    '''
    assert tf.shape(X)[0] == tf.shape(y)[0], 'X and y MUST be the same length'
    shuffled_indices = tf.random.shuffle(
        tf.range(start=0, limit=tf.shape(X)[0]),
        seed=seed
    )

    shuffled_X = tf.gather(X, shuffled_indices, axis=0)
    shuffled_y = tf.gather(y, shuffled_indices, axis=0)

    return shuffled_X, shuffled_y


Writing ../src/dataPreparation/shuffleTensors.py
