# Making sure that the preprocessing pipeline works

### Creating the data (trying out with 1 stock only)

In [1]:
import sys
import os

# Put the project's ../lib/ directory on the module search path so that
# `utils` can be imported from this notebook.
sys.path.insert(1, "../lib/")
# NOTE(review): the wildcard import hides where names come from. Besides the
# pipeline helpers (create_data, clean_and_scale, build_timeseries,
# trim_dataset, preproc_pipeline), `np` and `train_test_split` are used in
# later cells without any explicit import in the visible cells, so they
# presumably arrive through this line — confirm, and prefer explicit imports.
from utils import *

In [2]:
# Resolve the stocks directory once, as an absolute path, and remember it.
# A bare relative os.chdir("../data/Stocks/") is resolved against the *current*
# working directory, so re-running this cell would try to move one level
# further (or raise FileNotFoundError) instead of being a no-op.
try:
    STOCKS_DIR
except NameError:
    STOCKS_DIR = os.path.abspath("../data/Stocks/")
os.chdir(STOCKS_DIR)

In [3]:
# Load a single ticker file. create_data receives a list of file names; the
# preview below shows that each row is tagged with its source file in `symbol`.
df = create_data(["a.us.txt"])

1  out of  1


In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,symbol
0,1999-11-18,30.713,33.754,27.002,29.702,66277506,0,a.us.txt
1,1999-11-19,28.986,29.027,26.872,27.257,16142920,0,a.us.txt
2,1999-11-22,27.886,29.702,27.044,29.702,6970266,0,a.us.txt
3,1999-11-23,28.688,29.446,27.002,27.002,6332082,0,a.us.txt
4,1999-11-24,27.083,28.309,27.002,27.717,5132147,0,a.us.txt


### We first remove the timestamp column and scale the remaining data into a NumPy array

In [5]:
# clean_and_scale turns the frame into a 2-D NumPy array with 5 columns and
# values in [0, 1] (see the output below). Comparing with df.head(), the
# columns line up with Open, High, Low, Close, Volume.
# NOTE(review): the exact scaling method lives in utils — confirm there.
data = clean_and_scale(df)

In [6]:
# Display the scaled array (NumPy abbreviates the middle rows).
data

array([[0.23950492, 0.25761933, 0.21933581, 0.2237905 , 1.        ],
       [0.22189629, 0.21121072, 0.21790398, 0.19933957, 0.24356559],
       [0.2106806 , 0.21783771, 0.2197984 , 0.2237905 , 0.1051679 ],
       ...,
       [0.62070579, 0.59707862, 0.66836723, 0.6078851 , 0.0146749 ],
       [0.6188705 , 0.5936424 , 0.65888415, 0.60148486, 0.0252436 ],
       [0.61305873, 0.58971529, 0.6565712 , 0.59488461, 0.02571836]])

### We then do a train test split

In [7]:
# 75/25 split with shuffle=False, so the original row order is kept rather
# than sampled at random.
# NOTE(review): train_test_split is not imported explicitly in the visible
# cells; presumably it is scikit-learn's, re-exported via `from utils import *`
# — confirm. If so, the held-out part is the last 25% of the rows.
train_set, test_set = train_test_split(data, train_size=0.75, test_size=0.25, shuffle=False)

### We now want to reshape the data for the LSTM model

#### We first build the time series, using windows of `TIME_STEPS` time steps

In [8]:
# Number of consecutive rows per sample passed to build_timeseries; it shows
# up as the middle dimension of the training array further down.
TIME_STEPS = 3
# Batch size passed to trim_dataset; the trimmed lengths shown further down
# (3380 and 1120) are both multiples of it.
BATCH_SIZE = 20

In [9]:
# Column index handed to build_timeseries as its second argument. The values
# returned for the 'test' call (shown below) equal column 3 of `data`, which
# lines up with Close in df.head() — so this presumably selects the target
# column; confirm in utils.
target_col = 3

# Same call for both splits; only the input array and the mode flag differ.
train_ts = build_timeseries(train_set, target_col, TIME_STEPS, 'train')
test_ts = build_timeseries(test_set, target_col, TIME_STEPS, 'test')

In [10]:
# Inspect the training samples: each one holds TIME_STEPS consecutive rows,
# and successive samples are shifted by one row (see the output below).
train_ts

array([[[0.23950492, 0.25761933, 0.21933581, 0.2237905 , 1.        ],
        [0.22189629, 0.21121072, 0.21790398, 0.19933957, 0.24356559],
        [0.2106806 , 0.21783771, 0.2197984 , 0.2237905 , 0.1051679 ]],

       [[0.22189629, 0.21121072, 0.21790398, 0.19933957, 0.24356559],
        [0.2106806 , 0.21783771, 0.2197984 , 0.2237905 , 0.1051679 ],
        [0.21885786, 0.21532436, 0.21933581, 0.19678948, 0.09553893]],

       [[0.2106806 , 0.21783771, 0.2197984 , 0.2237905 , 0.1051679 ],
        [0.21885786, 0.21532436, 0.21933581, 0.19678948, 0.09553893],
        [0.20249315, 0.20416156, 0.21933581, 0.20393975, 0.07743422]],

       ...,

       [[0.22695354, 0.22057687, 0.2460779 , 0.22508055, 0.10288199],
        [0.22912531, 0.21866241, 0.24363278, 0.2210704 , 0.09403059],
        [0.22598492, 0.22200045, 0.24486636, 0.22741064, 0.13428407]],

       [[0.22912531, 0.21866241, 0.24363278, 0.2210704 , 0.09403059],
        [0.22598492, 0.22200045, 0.24486636, 0.22741064, 0.13428407],

In [11]:
# NOTE(review): unlike train_ts (3-D windows of all 5 features), the 'test'
# call returns a 1-D array; its last values equal column 3 of `data`. Confirm
# in utils that returning only that column for 'test' is intended.
test_ts

array([0.2395211 , 0.23847106, 0.24424128, ..., 0.6078851 , 0.60148486,
       0.59488461])

#### We then want to trim the time series to make sure it will fit the batch size correctly

In [12]:
# Trim both arrays with respect to BATCH_SIZE. The resulting lengths shown
# below (3380 and 1120) are multiples of 20, so trim_dataset presumably drops
# the trailing samples that do not fill a whole batch — confirm in utils.
train_trimmed = trim_dataset(train_ts, BATCH_SIZE)
test_trimmed = trim_dataset(test_ts, BATCH_SIZE)

In [13]:
# Expect (n_samples, TIME_STEPS, n_columns) with n_samples a multiple of BATCH_SIZE.
train_trimmed.shape

(3380, 3, 5)

In [14]:
# 1-D here because test_ts itself is 1-D; the length should be a multiple of BATCH_SIZE.
test_trimmed.shape

(1120,)

### We now want to create a validation set of the same size as the test set

In [15]:
# np.split with an integer cuts the array into that many equal parts (and
# raises if the length is not evenly divisible), so the trimmed test data is
# halved: first half for validation, second half for testing.
# NOTE(review): this rebinds `test_set`, which until now held the untrimmed
# split from train_test_split; re-running the build_timeseries cell after this
# one would therefore feed it the halved array instead.
halves = np.split(test_trimmed, 2)
val_set = halves[0]
test_set = halves[1]

In [18]:
# First half of test_trimmed: should hold len(test_trimmed) / 2 samples.
val_set.shape

(560,)

In [19]:
# Second half of test_trimmed: should match the validation set's shape.
test_set.shape

(560,)

## We now have our train, validation, and test sets!

## The preprocessing pipeline function combines the whole process!!

In [20]:
# Run the same steps end to end through the single pipeline helper, starting
# again from the raw frame rather than from the manually built splits.
data = clean_and_scale(df)
pipeline_splits = preproc_pipeline(data, TIME_STEPS, BATCH_SIZE)
pr_train_set, pr_val_set, pr_test_set = pipeline_splits

In [21]:
# Report the shapes produced by the one-call pipeline.
print(pr_train_set.shape)
print(pr_val_set.shape)
print(pr_test_set.shape)

# Turn the visual comparison into a check: the pipeline must yield the same
# shapes as the step-by-step version built in the earlier cells.
# NOTE(review): a value-level comparison (np.testing.assert_array_equal) would
# be stronger once it is confirmed that the pipeline uses the same split
# parameters as the manual steps.
assert pr_train_set.shape == train_trimmed.shape, "pipeline train shape differs from manual steps"
assert pr_val_set.shape == val_set.shape, "pipeline validation shape differs from manual steps"
assert pr_test_set.shape == test_set.shape, "pipeline test shape differs from manual steps"

(3380, 3, 5)
(560,)
(560,)
