# TensorFlow dataset tutorial

This tutorial shows how the raw data returned by the API is turned into a `tf.data` dataset for time series forecasting.

In [2]:
import pandas as pd
import tensorflow as tf
import numpy
import pickle as pkl
import os
# Switch the working directory to the parent folder so that the project-local
# `src` package (imported further down in this cell) can be resolved.
# The guard keeps the cell idempotent: without it, every re-run of this cell
# climbs one more directory level, which can break the `src` import.
# NOTE(review): assumes `src/` lives in the parent folder and not next to this
# notebook — confirm against the repository layout.
if not os.path.isdir("src"):
    os.chdir(os.path.join(os.getcwd(), ".."))
import finviz as fz
import yfinance as yf
import calendar
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc, average_precision_score, log_loss
#from sklearn.metrics import PrecisionRecallDisplay

from src.utils import data_layer, get_dataset, hist_multi_plot

%load_ext autoreload
%autoreload 2

In [3]:
# Split fractions. `perc_time` is the share of windows (in chronological
# order) that goes to the training set.
# NOTE(review): `perc_space` is not used in the cells shown here — presumably a
# split across tickers; confirm.
perc_space = perc_time = 0.8

# Length of every sliding window in time steps. The last step of a window
# provides the target; the steps before it are the model inputs.
WINDOW_SIZE = 21
# NOTE(review): presumably the mini-batch size used for training — confirm.
BATCH_SIZE = 32

## Data loading and formatting

In [7]:
# Download the raw AAPL table from the data API; the endpoint serves CSV.
data = pd.read_csv(filepath_or_buffer="http://35.164.216.200:8080/raw_data/AAPL")

The following is the raw data returned by the API. This particular example uses one year of Apple Inc. (AAPL) stock data.

In [8]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,S_transactionpricepershare,A_transactionshares,...,M_transactionshares,S_transactionshares,A_transactionvalue,G_transactionvalue,M_transactionvalue,S_transactionvalue,Amount_negotiated,Amount_negotiated_MA,Perc_amount_vs_MA,Perc_amount_sp100
0,2019-04-17,196.43,200.21,195.51,199.96,28906800,0.0,0,,,...,,,,,,,5780204000.0,,,5.19253
1,2019-04-18,199.95,200.97,199.36,200.68,24195800,0.0,0,,,...,,,,,,,4855613000.0,,,4.950565
2,2019-04-22,199.67,201.75,199.19,201.34,19439500,0.0,0,,,...,,,,,,,3913949000.0,,,4.89108
3,2019-04-23,201.24,204.51,200.72,204.25,23323000,0.0,0,,,...,,,,,,,4763723000.0,,,4.600167
4,2019-04-24,204.13,205.23,203.82,203.93,17540600,0.0,0,,,...,,,,,,,3577055000.0,,,3.619047


The `Date` column (together with the unnamed index column) is discarded, as it is not a feature.

In [15]:
# Select the columns fed to the model; 'Date' and 'Unnamed: 0' are left out.
# `.copy()` makes `features` an independent frame, so later assignments to it
# do not write into a slice of `data` (avoids pandas' SettingWithCopyWarning
# and keeps the raw frame untouched).
# NOTE(review): 'S_transactionshares' is listed twice — once among the inputs
# and once as the last column, which the windowing step reads as the label.
# Confirm that the duplicate is intended.
features = data[[ 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'S_transactionpricepershare', 'A_transactionshares',
       'G_transactionshares', 'M_transactionshares', 'S_transactionshares',
       'A_transactionvalue', 'G_transactionvalue', 'M_transactionvalue',
       'S_transactionvalue', 'Amount_negotiated', 'Amount_negotiated_MA',
       'Perc_amount_vs_MA', 'Perc_amount_sp100', 'S_transactionshares']].copy()

In this example the labels are binary: 1 if there was an insider sale on that day, 0 otherwise.

In [16]:
# Binary label: 1 where the raw 'S_transactionshares' value is present,
# 0 where it is missing.
# The label is computed from the untouched `data` frame instead of from
# `features` itself, so re-running this cell yields the same result (reading
# back the already-converted column would find no NaNs and set every label to 1).
# NOTE(review): 'S_transactionshares' appears twice in `features`, so this
# assignment presumably overwrites both columns — confirm that is intended.
features['S_transactionshares'] = data['S_transactionshares'].notna().astype('int64')

Now sliding windows, each shifted by one time step, are built. The last row of every window is removed from the inputs, and the label is extracted from that last row.

In [20]:
# Turn the feature rows into (inputs, label) pairs built from sliding windows.
dataset = (
    tf.data.Dataset.from_tensor_slices(features.values)
    # Windows of WINDOW_SIZE consecutive rows, each starting one row after the
    # previous one; incomplete windows at the end are dropped.
    .window(WINDOW_SIZE, shift=1, drop_remainder=True)
    # Every window is itself a dataset; batching it by its full length
    # collapses it into a single tensor.
    .flat_map(lambda rows: rows.batch(WINDOW_SIZE))
    # Inputs: every row except the last one.
    # Label: last column of the last row (sliced so it keeps two dimensions).
    .map(lambda rows: (rows[:-1], rows[-1:, -1:]))
)

Now we are going to split the dataset into training and validation sets:

In [26]:
# Number of rows (time steps) in the raw table.
samples = data.shape[0]
# Sliding windows of length WINDOW_SIZE with shift=1 and drop_remainder=True
# give samples - WINDOW_SIZE + 1 windows, hence the "+ 1".
# Clamped at 0 so that a table shorter than one window cannot produce a
# negative split size.
n_windows = max(samples - WINDOW_SIZE + 1, 0)
# The first `perc_time` fraction of the windows (chronologically) is used for training.
train_samples = int(n_windows * perc_time)

In [27]:
# Two independent, initially empty lists that collect the training and the
# validation split of each dataset.
train_datasets, valid_datasets = [], []

In [28]:
# Chronological split: the first `train_samples` windows go to training,
# everything after them goes to validation.
train_datasets += [dataset.take(train_samples)]
valid_datasets += [dataset.skip(train_samples)]

In [29]:
# Merge the collected splits into one training and one validation dataset.
# Both stay None when no split has been collected.
train_ds = val_ds = None
for idx, sample in enumerate(train_datasets):
    if train_ds is not None:
        # Later entries are appended to what has been accumulated so far.
        train_ds = train_ds.concatenate(sample)
        val_ds = val_ds.concatenate(valid_datasets[idx])
        continue
    # The first entry seeds both accumulators.
    train_ds = sample
    val_ds = valid_datasets[idx]


### And that's it!