In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

from train_helpers import load_raw_bitcoin_df
from sktime.forecasting.model_selection import temporal_train_test_split
from sklearn.preprocessing import MinMaxScaler


def load_raw_bitcoin_df(path='../download/price.csv'):
    """Load historical BTC prices from a CSV file into a NaN-free dataframe.

    NOTE(review): this definition shadows the ``load_raw_bitcoin_df`` that is
    imported from ``train_helpers`` in the notebook's import cell.

    Parameters
    ----------
    path : str or path-like, default '../download/price.csv'
        Location of the CSV file. Its header row is discarded and the columns
        are renamed to ``date``, ``price``, ``h``, ``l``, ``o``; only ``date``
        and ``price`` are read.

    Returns
    -------
    pd.DataFrame
        Dataframe with a single ``price`` column, indexed by the parsed
        ``date`` column, with every row containing a NaN removed.
    """
    bitcoin = pd.read_csv(
        path,
        index_col=0,  # the first selected column, i.e. "date"
        parse_dates=True,  # parse the index as datetimes
        names=["date", "price", "h", "l", "o"],
        usecols=["date", "price"],
        header=0,  # row 0 is the file's own header; `names` replaces it
    )
    return bitcoin.dropna()

# Load the NaN-free price frame; the split cells below slice it by row position.
bitcoin = load_raw_bitcoin_df()

In [15]:
# 70% train, 20% val, 10% test
# int() truncates, so the three sizes can add up to fewer rows than len(bitcoin).
train_num = int(0.7 * len(bitcoin))
val_num = int(0.2 * len(bitcoin))
# `test_num` is read by the leftover check in the next cell but was never
# defined in the notebook, so a fresh-kernel run raised NameError there.
test_num = int(0.1 * len(bitcoin))

In [16]:
# Rows left over once the three split sizes are subtracted from the total.
len(bitcoin) - (train_num + val_num + test_num)

1

In [17]:
# Chronological split by row position: first `train_num` rows, then the next
# `val_num` rows; `test` takes everything that remains (including any leftover
# row from rounding the sizes down).
train = bitcoin.iloc[:train_num]
val = bitcoin.iloc[train_num:train_num + val_num]
test = bitcoin.iloc[train_num + val_num:]

In [18]:
# The three splits together must contain exactly as many rows as the source frame.
assert sum(len(part) for part in (train, val, test)) == len(bitcoin)

In [4]:
bitcoin = load_raw_bitcoin_df()

# Two-stage temporal split: hold out the last 10% as test, then cut the
# remaining 90% at 77/23, giving roughly 70% train, 20% val, 10% test overall.
train, test = temporal_train_test_split(bitcoin, train_size=0.9)
train, val = temporal_train_test_split(train, train_size=0.77)

# Log-transform every split.
train, val, test = (np.log(split) for split in (train, val, test))

# Fit the scaler on the training split only, then apply the same scaling to
# the validation and test splits.
min_max = MinMaxScaler()
train = min_max.fit_transform(train)
val, test = min_max.transform(val), min_max.transform(test)

In [6]:
# NOTE(review): imported mid-notebook; consider moving it to the top import cell.
import plotly.express as px

# `train` here is the min-max-scaled log price array from the preceding cell.
# Title and axis labels are set so the figure can be read on its own.
px.line(
    train,
    title="Training split: min-max scaled log BTC price",
    labels={"index": "time step", "value": "scaled log price"},
)


numpy.ndarray size changed, may indicate binary incompatibility. Expected 16 from C header, got 96 from PyObject



In [59]:
def make_windowed_dataset(series, input_seq_length, output_seq_length, batch_size):
    """Build a batched tf.data pipeline of (input, target) sliding windows.

    Parameters
    ----------
    series : array-like
        Data sliced along its first axis into consecutive time steps.
    input_seq_length : int
        Number of leading steps in each window used as the model input.
    output_seq_length : int
        Number of trailing steps in each window used as the target.
    batch_size : int
        Number of windows per batch.

    Returns
    -------
    tf.data.Dataset
        Yields ``(inputs, targets)`` pairs whose leading dimensions are
        ``(batch_size, input_seq_length)`` and ``(batch_size, output_seq_length)``.
        Incomplete windows and an incomplete final batch are dropped.
    """
    total_seq_length = input_seq_length + output_seq_length
    return (
        tf.data.Dataset.from_tensor_slices(series)
        # One window per starting step, each spanning input + target steps.
        .window(total_seq_length, shift=1, drop_remainder=True)
        # Each window is itself a dataset; collapse it into a single tensor.
        .flat_map(lambda w: w.batch(total_seq_length, drop_remainder=True))
        # Leading steps become the input, trailing steps the target.
        .map(lambda w: (w[:-output_seq_length], w[-output_seq_length:]))
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.AUTOTUNE)
    )


# Toy series 0..99 with a trailing feature axis, used to eyeball the windowing.
# NOTE(review): this rebinds `train`, replacing the scaled price array that an
# earlier cell assigned to the same name.
train = np.arange(100)
train = np.expand_dims(train, axis=1)

input_seq_length = 4
output_seq_length = 2
batch_size = 5

total_seq_length = input_seq_length + output_seq_length

train_ds = make_windowed_dataset(
    train, input_seq_length, output_seq_length, batch_size
)

print(type(train_ds))

for x in train_ds.take(2):
    print(x)

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>
(<tf.Tensor: shape=(5, 4, 1), dtype=int64, numpy=
array([[[0],
        [1],
        [2],
        [3]],

       [[1],
        [2],
        [3],
        [4]],

       [[2],
        [3],
        [4],
        [5]],

       [[3],
        [4],
        [5],
        [6]],

       [[4],
        [5],
        [6],
        [7]]])>, <tf.Tensor: shape=(5, 2, 1), dtype=int64, numpy=
array([[[4],
        [5]],

       [[5],
        [6]],

       [[6],
        [7]],

       [[7],
        [8]],

       [[8],
        [9]]])>)
(<tf.Tensor: shape=(5, 4, 1), dtype=int64, numpy=
array([[[ 5],
        [ 6],
        [ 7],
        [ 8]],

       [[ 6],
        [ 7],
        [ 8],
        [ 9]],

       [[ 7],
        [ 8],
        [ 9],
        [10]],

       [[ 8],
        [ 9],
        [10],
        [11]],

       [[ 9],
        [10],
        [11],
        [12]]])>, <tf.Tensor: shape=(5, 2, 1), dtype=int64, numpy=
array([[[ 9],
        [10]],

 

In [55]:
# Number of axes of a plain arange array.
np.arange(10).ndim

1