# Data Engineering and Preparation with Mock Data
This notebook walks through the most important steps of the implemented data preparation using mock data. It serves as a sanity test because we know exactly what values to expect. The raw mock data is a matrix in which every element of a row contains that row's index (indexing starts at 1).

## Turning Data into Series

In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
def data_to_sequences(data, seq_length):
    """
    Cuts a tensor into overlapping windows along its first dimension.

    Windows of ``seq_length`` consecutive entries are taken along dimension 0,
    each window starting one entry after the previous one. Dimensions 1 and 2
    of the unfolded result are then swapped so that the window axis comes
    before the remaining data axis.

    Args:
    data (torch.Tensor): The input tensor. For a 2-D input of shape
        (rows, features) the result has shape
        (rows - seq_length + 1, seq_length, features).
    seq_length (int): Number of consecutive entries in each window.

    Returns:
    torch.Tensor: The stacked windows.
    """
    window_step = 1  # consecutive windows start one entry apart
    # unfold appends the window axis as the last dimension ...
    windows = data.unfold(0, seq_length, window_step)
    # ... so swap it in front of the feature axis
    return windows.transpose(1, 2)

In [3]:
# Dummy data: 100 rows x 10 columns, every entry of row i holds the 1-based row index
arr = np.ones((100, 10)) * np.arange(1,101).reshape(-1,1)
data = torch.tensor(arr, dtype=torch.float32)
print(f'Dummy Data: \n{data[:,:]}')

# Sliding windows of 15 consecutive rows
sequences = data_to_sequences(data, 15)

# Sanity checks: the expected values are known exactly, so verify them
# instead of relying on reading the printed output only.
assert sequences.shape == (100 - 15 + 1, 15, 10)
assert torch.equal(sequences[0, :, 0], torch.arange(1, 16, dtype=torch.float32))
assert torch.equal(sequences[1, :, 0], torch.arange(2, 17, dtype=torch.float32))

print(f'First Element of Dummy Sequences: \n{sequences[0,:,:]}')
print(f'Second Element of Dummy Sequences: \n{sequences[1,:,:]}')


Dummy Data: 
tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.],
        [  3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.],
        [  4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.],
        [  5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.],
        [  6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.],
        [  7.,   7.,   7.,   7.,   7.,   7.,   7.,   7.,   7.,   7.],
        [  8.,   8.,   8.,   8.,   8.,   8.,   8.,   8.,   8.,   8.],
        [  9.,   9.,   9.,   9.,   9.,   9.,   9.,   9.,   9.,   9.],
        [ 10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.,  10.],
        [ 11.,  11.,  11.,  11.,  11.,  11.,  11.,  11.,  11.,  11.],
        [ 12.,  12.,  12.,  12.,  12.,  12.,  12.,  12.,  12.,  12.],
        [ 13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.,  13.],
        [ 14.,  14.,  14.,  14.,  14.,  14.,  14.,  14.,  14.,  14.],
       

### Turning Sequences into Batches

In [5]:
def batch_tensor(tensor, batch_size):
    """
    Cuts a tensor into consecutive batches along its first dimension.

    Args:
    tensor (torch.Tensor): The tensor to be divided into batches.
    batch_size (int): Number of entries per batch along dimension 0.

    Returns:
    list of torch.Tensor: The batches, in their original order.
    """
    # Tensor.split yields a tuple of chunks; callers expect a list
    return [chunk for chunk in tensor.split(batch_size, dim=0)]

# Batch the dummy sequences created above
batches = batch_tensor(sequences, 64)
# Batching must neither drop nor duplicate sequences
assert sum(batch.size(0) for batch in batches) == sequences.size(0)
batches[0].shape

torch.Size([64, 15, 10])

### Complete Pipeline for Dummy Data: Data Frame -> Sequences -> Batches

In [6]:
def df_list_to_series_tensor(data_list, sequence_lenght, shuffle=True, generator=None):
    """
    Turns a list of data tensors into one tensor of (optionally shuffled) sequences.

    Every element of ``data_list`` is cut into sequences with
    ``data_to_sequences``; the sequences of all elements are then concatenated
    along the first dimension.

    Args:
    data_list (list of torch.Tensor): The tensors to be turned into sequences.
    sequence_lenght (int): Sequence length handed to ``data_to_sequences``.
        (The spelling is kept so existing keyword callers keep working.)
    shuffle (bool): If True, the sequences are randomly permuted along the
        first dimension.
    generator (torch.Generator, optional): Random number generator used for
        the permutation. Pass a seeded generator for a reproducible shuffle;
        None (the default) uses torch's global random state.

    Returns:
    torch.Tensor: All sequences stacked along the first dimension.
    """
    # Turning data into sequences
    tensor_list = [data_to_sequences(data, sequence_lenght) for data in data_list]
    # Aggregating the sequences in one tensor
    collected_tensor = torch.cat(tensor_list, dim=0)

    if shuffle:
        # Shuffling the sequences (only their order, not their content)
        shuffled_ind = torch.randperm(collected_tensor.size(0), generator=generator)
        collected_tensor = collected_tensor[shuffled_ind]
    return collected_tensor

In [7]:
# Parameters
time_steps_in_data = 10**4
nr_stocks = 200
sequence_lenght = 10
batch_size = 7
feature_dimensions = 12

# Implementing Dummy Data: every entry of row i holds the 1-based row index
arr = np.ones((time_steps_in_data, feature_dimensions)) * np.arange(1,time_steps_in_data+1).reshape(-1,1)
data = torch.tensor(arr, dtype=torch.float32) # Data for one Stock
data_list = [data] * nr_stocks # List of Data for Different Stocks (the same tensor repeated)

# Turning data into sequences
collected_tensor = df_list_to_series_tensor(data_list, sequence_lenght, shuffle=True)

# Sanity checks that hold regardless of the shuffle order
expected_nr_sequences = nr_stocks * (time_steps_in_data - sequence_lenght + 1)
assert collected_tensor.shape == (expected_nr_sequences, sequence_lenght, feature_dimensions)
# Within every sequence the row index must increase by exactly one per time step
first_feature = collected_tensor[:, :, 0]
assert torch.all(first_feature[:, 1:] - first_feature[:, :-1] == 1)

# Turning the aggregated sequences into batches
batches = batch_tensor(collected_tensor, batch_size)

# All sequences must end up in exactly one batch; only the last batch may be smaller
assert len(batches) == (expected_nr_sequences + batch_size - 1) // batch_size
assert sum(batch.size(0) for batch in batches) == expected_nr_sequences
