In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datetime import timedelta

from lib.data.dataset import Dataset
from lib.scraper.scraper import Scraper
from lib.scraper.ticker import CommonTickers
from lib.data.feature_generator import FeatureGenerator
from lib.data.features.common_features import CommonFeatures
from lib.data.features.common_targets import CommonTargets

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


First, we need to scrape the data from IEX Cloud. This is done via the `Scraper` class.

In [2]:
# Build the scraper from the private IEX Cloud sandbox config file.
# NOTE(review): `should_print=False` presumably silences the scraper's console
# output — confirm against the Scraper class.
s = Scraper(
    config_file='lib/scraper/config/iexcloud-sandbox-private.json',
    should_print=False
)

# Bare last expression: shows the scraper's repr as the cell output.
s

<lib.scraper.scraper.Scraper at 0x108492700>

Next, we need to actually scrape the data.


*Note: In this example, we will use sandbox data from IEX Cloud. This data is random and does not reflect actual stock data. Therefore, this example is for demonstration purposes only.*

In [3]:
# Fetch Disney intraday stock data for the `start`..`end` date range.
# The call returns both the data and a filename; the filename is handed to
# FeatureGenerator in a later cell.
# NOTE(review): presumably `time_delta=timedelta(days=1)` makes the scraper
# step through the range one day at a time, and `save_data=True` writes the
# result to the returned filename — confirm against Scraper.
disney_data, disney_filename = s.get_intraday_stock_data(
    ticker=CommonTickers.DISNEY,
    start='2021-04-20',
    end='2021-04-29',
    time_delta=timedelta(days=1),
    save_data=True
)

# Preview the first rows of the scraped data.
disney_data.head()

Unnamed: 0_level_0,minute,label,high,low,average,volume,notional,numberOfTrades,marketHigh,marketLow,marketAverage,marketVolume,marketNotional,marketNumberOfTrades,open,close,marketOpen,marketClose,changeOverTime,marketChangeOverTime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-04-20,09:30,09:30 AM,194.55,187.393,195.423,3606,663927.138,92,192.13,191.5,193.288,202651,37428570.0,211,191.79,191.44,194.33,193.64,0.0,0.0
2021-04-20,09:31,09:31 AM,193.48,191.74,196.159,582,106573.47,12,189.18,193.926,188.78,34240,6383953.0,123,196.73,187.59,190.73,189.859,-0.000486,0.001917
2021-04-20,09:32,09:32 AM,190.179,192.01,192.846,211,37880.13,7,194.74,192.39,187.12,25793,4952529.0,144,193.784,190.467,190.5,196.13,-0.003824,-0.001401
2021-04-20,09:33,09:33 AM,191.54,186.92,196.008,912,167921.87,10,187.41,188.89,196.065,12507,2388168.0,98,188.06,193.16,191.33,192.73,-0.004421,-0.002486
2021-04-20,09:34,09:34 AM,191.81,186.75,193.991,536,101135.64,9,187.63,193.93,188.021,9751,1829149.0,74,195.43,192.69,187.08,193.1,-0.004533,-0.002895


Now, we need to generate some features for our data and clean it up a bit:

In [4]:
# Create a FeatureGenerator from the file the scraper returned.
# NOTE(review): the semantics of `auto_clean` and `parse_dates` live in
# FeatureGenerator; presumably they clean the raw rows and parse the date
# columns into timestamps — confirm.
feature_generator = FeatureGenerator(
    filename=disney_filename,
    auto_clean=True,
    parse_dates=True
)

# Bare last expression: shows the generator's repr as the cell output.
feature_generator

<lib.data.feature_generator.FeatureGenerator at 0x17f16fbb0>

In [5]:
feature_generator.build_features(
    [
        # Sine/cosine pair for the day of the year, using a period of 365
        # (leap days are not accounted for).
        # NOTE(review): presumably Sinify/Cosify map the column onto a cycle of
        # length `period` — confirm in CommonFeatures.
        CommonFeatures.Sinify('day_of_year', period=365),
        CommonFeatures.Cosify('day_of_year', period=365),
        # Same pair for the minute of the day; 60 * 24 = 1440 minutes per day.
        CommonFeatures.Sinify('minute_of_day', period=(60 * 24)),
        CommonFeatures.Cosify('minute_of_day', period=(60 * 24)),
        # One-hot encode the calendar columns; the results appear in the
        # exported frame as `weekday_<name>` / `hour_of_day_<n>` columns.
        CommonFeatures.OneHotEncoder('weekday'),
        CommonFeatures.OneHotEncoder('hour_of_day')
    ]
)

Now, let's add some targets to our data that we will try to predict!

In [6]:
# Targets are registered through the same `build_features` entry point as the
# input features.
feature_generator.build_features([
    # Value of `marketLow` one minute ahead.
    CommonTargets.FutureValue(
        feature='marketLow',
        target_time_delta=timedelta(minutes=1),
    ),
    # Operates on `future_value`, the feature generated by the entry directly above.
    CommonTargets.FutureValueChange(feature='future_value'),
])

Now that we have our features generated, we can generate a dataset from our features:

In [7]:
# We don't want to include the original value of the one-hot-encoded variables;
# their encoded columns represent them instead.
features_to_exclude = ['weekday', 'hour_of_day']
# Export the generated features with `future_value_change` designated as the target.
# NOTE(review): `future_value` is not excluded and shows up as a column of the
# exported frame. It holds the next minute's `marketLow`, so confirm that it is
# not leaking look-ahead information into the model inputs.
exported_data = feature_generator.export(
    target_feature='future_value_change',
    features_to_exclude=features_to_exclude
)
# Preview the first rows of the exported frame.
exported_data.head()

Unnamed: 0_level_0,marketHigh,marketLow,marketAverage,marketVolume,marketNotional,marketNumberOfTrades,marketOpen,marketClose,marketChangeOverTime,year,...,weekday_Tuesday,weekday_Wednesday,hour_of_day_10,hour_of_day_11,hour_of_day_12,hour_of_day_13,hour_of_day_14,hour_of_day_15,future_value,future_value_change
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-04-20 09:30:00,192.13,191.5,193.288,202651.0,37428570.0,211.0,194.33,193.64,0.0,2021.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,193.926,-0.007921
2021-04-20 09:31:00,189.18,193.926,188.78,34240.0,6383953.0,123.0,190.73,189.859,0.001917,2021.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,192.39,-0.018192
2021-04-20 09:32:00,194.74,192.39,187.12,25793.0,4952529.0,144.0,190.5,196.13,-0.001401,2021.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,188.89,0.026682
2021-04-20 09:33:00,187.41,188.89,196.065,12507.0,2388168.0,98.0,191.33,192.73,-0.002486,2021.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,193.93,0.007941
2021-04-20 09:34:00,187.63,193.93,188.021,9751.0,1829149.0,74.0,187.08,193.1,-0.002895,2021.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,195.47,-0.043536


Now, we will create an instance of the `Dataset` class, which manages normalization, the train/validation/test split, and saving/loading for us:

In [8]:
# Wrap the exported frame in a Dataset.
# NOTE(review): `train_fraction=0.7` presumably assigns 70% of the data to the
# training split, and `target_max_threshold=float('inf')` presumably disables
# any threshold-based filtering of the target — confirm both in Dataset.
dataset = Dataset(
    df=exported_data,
    train_fraction=0.7,
    target_max_threshold=float('inf')
)

# Show the shape of the training split.
# NOTE(review): the recorded output is 3-D, presumably
# (windows, timesteps, features) — confirm the axis meaning in Dataset.
dataset.train.shape

(1842, 60, 29)

We can easily save this dataset to disk for later use:

In [9]:
# Save under the ticker the data was actually scraped for (Disney); the
# previous `CommonTickers.ATT` labelled the saved folder with the wrong ticker.
# `save_to_disk` returns the path that is later used to reload the dataset.
data_path = dataset.save_to_disk(CommonTickers.DISNEY.ticker)

Successfully saved dataset to `exported_data/datasets/1619751670-T/*`


And we can reload that dataset easily, like this:

In [10]:
# Rebuild a Dataset from the folder returned by `save_to_disk`, instead of from a DataFrame.
dataset_from_disk = Dataset(folder_path=data_path)

For a sanity check, let's make sure all of the data was saved/retrieved losslessly:

In [11]:
# Round-trip check: the reloaded training split must equal the in-memory one element for element.
np.array_equal(dataset.train, dataset_from_disk.train)

True

In [12]:
# Round-trip check: the reloaded validation split must equal the in-memory one element for element.
np.array_equal(dataset.val, dataset_from_disk.val)

True

In [13]:
# Round-trip check: the reloaded test split must equal the in-memory one element for element.
np.array_equal(dataset.test, dataset_from_disk.test)

True

Now that we have all of our data loaded, let's train an example model!

In [14]:
def compile_and_fit(model, ds, patience=5, epochs=50):
    """Compile `model` for regression and fit it on the splits held by `ds`.

    The model is compiled with a mean-squared-error loss, the Adam optimizer
    (default settings) and mean-absolute-error as an additional metric.
    Training stops early once the validation loss has failed to improve for
    `patience` consecutive epochs.

    Args:
        model: Keras model exposing `compile` and `fit`.
        ds: Object exposing `train_X`, `train_y`, `val_X` and `val_y`.
        patience: Number of epochs without validation-loss improvement that
            are tolerated before training is stopped early.
        epochs: Maximum number of training epochs.

    Returns:
        The object returned by `model.fit` (the training history).
    """
    # `patience` used to be accepted but silently ignored; wire it to an
    # early-stopping callback that watches the validation loss.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=patience,
        mode='min'
    )

    model.compile(loss=tf.losses.MeanSquaredError(),
                  optimizer=tf.optimizers.Adam(),
                  metrics=[tf.metrics.MeanAbsoluteError()])

    history = model.fit(ds.train_X, ds.train_y, epochs=epochs,
                        validation_data=(ds.val_X, ds.val_y),
                        callbacks=[early_stopping])
    return history

In [15]:
# Two-layer fully connected network: a 60-unit ReLU hidden layer followed by a
# single linear output unit.
dense_model = tf.keras.Sequential()
dense_model.add(tf.keras.layers.Dense(60, activation='relu'))
dense_model.add(tf.keras.layers.Dense(1))

# Keyword arguments make the two numbers self-describing. `patience` (5999) is
# far larger than the epoch count (10), so all 10 epochs run.
history = compile_and_fit(dense_model, dataset, patience=5999, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Unsurprisingly, this model performs poorly, since we are working with completely random data (because we pulled it from the sandbox). However, the loss does decrease with each epoch, so the pipeline itself is working as intended!

Feel free to mess around with the library and create your own models!