# Time Series Dataset for Machine Learning:

Windowing the data

Splitting into features and labels 

Coursera Colab Link: https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%204%20-%20S%2BP/S%2BP%20Week%202%20Lesson%201.ipynb

TF Dataset:  https://www.tensorflow.org/api_docs/python/tf/data/Dataset


In [3]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

import numpy as np
import matplotlib.pyplot as plt

2.2.0-rc2


In [4]:
# Build a toy series 0..9 as a tf.data.Dataset.
# Iterating the dataset yields one tensor per element; .numpy() unwraps each
# tensor into a plain NumPy value so that only the number is printed.
# (dataset.as_numpy_iterator() is an alternative that yields NumPy values
# directly, without the explicit .numpy() call.)
dataset = tf.data.Dataset.range(10)

for element in dataset:
    scalar = element.numpy()
    print(scalar)

0
1
2
3
4
5
6
7
8
9


In [19]:
# Slide a window of 5 values over the series 0..9, advancing 1 value per step.
# drop_remainder is not set here, so the windows near the end of the series
# become shorter than 5 as the data runs out.
dataset = tf.data.Dataset.range(10).window(5, shift=1)

# Each element of a windowed dataset is itself a small dataset, so every
# window is materialised separately with its own as_numpy_iterator().
# NOTE (from the original notebook): dataset.numpy() and printing
# dataset.as_numpy_iterator() on the outer dataset do not work here.
for sub_dataset in dataset:
    values = list(sub_dataset.as_numpy_iterator())
    print(values)

print()

# Alternative: print every window value by value on a single line.
# Values are separated by one space and the line ends with a space followed
# by the newline.
for sub_dataset in dataset:
    print(*(item.numpy() for item in sub_dataset), end=' \n')

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]
[6, 7, 8, 9]
[7, 8, 9]
[8, 9]
[9]

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 
6 7 8 9 
7 8 9 
8 9 
9 


In [22]:
# Windowing with the short trailing windows removed.
# drop_remainder=True discards every window that holds fewer than 5 values,
# so only full-length windows remain.
dataset = tf.data.Dataset.range(10).window(5, shift=1, drop_remainder=True)

# Each window is a nested dataset and is materialised on its own.
# NOTE (from the original notebook): dataset.numpy() and printing
# dataset.as_numpy_iterator() on the outer dataset do not work here.
for sub_dataset in dataset:
    values = list(sub_dataset.as_numpy_iterator())
    print(values)

print()

# Alternative: print every window value by value on a single line
# (one space after each value, then the newline).
for sub_dataset in dataset:
    print(*(item.numpy() for item in sub_dataset), end=' \n')


[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


In [26]:
# Flatten the nested windows into plain tensors.
# window(...) produces a dataset whose elements are themselves datasets.
# flat_map + batch(5) collects the 5 values of every window into one tensor,
# so each element of the resulting dataset is a single 5-value tensor.
dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)  # full windows of 5, step 1
    .flat_map(lambda sub_dataset: sub_dataset.batch(5))
)

# The elements are tensors now (not nested datasets), so they are converted
# with .numpy() rather than with as_numpy_iterator().
# NOTE (from the original notebook): dataset.numpy() and printing
# dataset.as_numpy_iterator() do not print the values.
for window_tensor in dataset:
    print(window_tensor.numpy())

print()

# Alternative: iterate over the values inside each window tensor and print
# them on one line (one space after each value, then the newline).
for window_tensor in dataset:
    print(*(item.numpy() for item in window_tensor), end=' \n')


[0 1 2 3 4]
[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


In [34]:
# Split every window into features and a label.
# The first n-1 values of a window are the features; the last value (kept as
# a length-1 slice) is the label.
dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)  # full windows of 5, step 1
    # Turn each nested window dataset into one tensor; the slicing in the
    # next step needs a tensor to index into.
    .flat_map(lambda sub_dataset: sub_dataset.batch(5))
    .map(lambda values: (values[:-1], values[-1:]))
)

# Each element is now a (features, label) pair of tensors, converted with
# .numpy() for printing.
# NOTE (from the original notebook): dataset.numpy() and printing
# dataset.as_numpy_iterator() do not print the values.
for features, label in dataset:
    print(features.numpy(), label.numpy())

print()

# Alternative: treat each element as a tuple and print its parts (features,
# then label) on one line, each followed by a single space.
for pair in dataset:
    print(*(part.numpy() for part in pair), end=' \n')


[0 1 2 3] [4]
[1 2 3 4] [5]
[2 3 4 5] [6]
[3 4 5 6] [7]
[4 5 6 7] [8]
[5 6 7 8] [9]

[0 1 2 3] [4] 
[1 2 3 4] [5] 
[2 3 4 5] [6] 
[3 4 5 6] [7] 
[4 5 6 7] [8] 
[5 6 7 8] [9] 


In [37]:
# Shuffle the (features, label) pairs.
dataset = (
    tf.data.Dataset.range(10)
    .window(size=5, shift=1, drop_remainder=True)  # full windows of 5, step 1
    # Turn each nested window dataset into one tensor; the slicing in the
    # next step needs a tensor to index into.
    .flat_map(lambda sub_dataset: sub_dataset.batch(5))
    # First n-1 values are the features, the last value is the label.
    .map(lambda values: (values[:-1], values[-1:]))
    # Randomise the order of the windows.
    .shuffle(buffer_size=10, seed=50)
)

# Each element is a (features, label) pair of tensors, converted with
# .numpy() for printing.
# NOTE (from the original notebook): dataset.numpy() and printing
# dataset.as_numpy_iterator() do not print the values.
for features, label in dataset:
    print(features.numpy(), label.numpy())

print()

# Alternative: print the parts of each pair on one line, each followed by a
# single space. This is a second pass over the dataset; in the recorded
# output its order differs from the first pass, i.e. the data is shuffled
# again on every iteration.
for pair in dataset:
    print(*(part.numpy() for part in pair), end=' \n')

[3 4 5 6] [7]
[4 5 6 7] [8]
[1 2 3 4] [5]
[0 1 2 3] [4]
[5 6 7 8] [9]
[2 3 4 5] [6]

[2 3 4 5] [6] 
[4 5 6 7] [8] 
[3 4 5 6] [7] 
[0 1 2 3] [4] 
[5 6 7 8] [9] 
[1 2 3 4] [5] 


In [41]:
# Batching the dataset: the complete windowing pipeline.
#
# The pipeline settings are named once here instead of being repeated as
# literals. In particular the window length is used twice (by window() and by
# the inner batch()); both uses must agree, otherwise flat_map would emit
# windows of the wrong size.
SERIES_LENGTH = 10   # toy series 0..9
WINDOW_SIZE = 5      # values per window: WINDOW_SIZE - 1 features + 1 label
WINDOW_SHIFT = 1     # step between the starts of consecutive windows
SHUFFLE_BUFFER = 10  # larger than the 6 windows produced here
SHUFFLE_SEED = 50
BATCH_SIZE = 2       # windows per batch

dataset = tf.data.Dataset.range(SERIES_LENGTH)
# drop_remainder=True drops all windows shorter than WINDOW_SIZE.
dataset = dataset.window(size=WINDOW_SIZE, shift=WINDOW_SHIFT, drop_remainder=True)
# Turn each nested window dataset into one tensor; the slicing below needs a
# tensor to index into.
dataset = dataset.flat_map(lambda window: window.batch(WINDOW_SIZE))
# First n-1 values are the features, the last value is the label.
dataset = dataset.map(lambda window: (window[:-1], window[-1:]))
dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER, seed=SHUFFLE_SEED)
# Group BATCH_SIZE windows per batch; prefetch(1) keeps one batch prepared
# ahead of the consumer.
dataset = dataset.batch(batch_size=BATCH_SIZE).prefetch(1)

# Each element is now a batch: x holds BATCH_SIZE feature rows and y the
# matching labels. The elements are tensors, so .numpy() is used for printing.
# NOTE (from the original notebook): dataset.numpy() and printing
# dataset.as_numpy_iterator() do not print the values.
for x, y in dataset:
    print('x=', x.numpy())
    print('y=', y.numpy())

print()

# Alternative: iterate over the (x, y) tuple of each batch. Every part is
# printed followed by the marker text ' end of window ' instead of a newline,
# and the bare print() ends the line once both parts are out.
# This is a second pass over the dataset; in the recorded output its order
# differs from the first pass, i.e. the data is shuffled again on every
# iteration.
for batch in dataset:
    for part in batch:
        print(part.numpy(), end=' end of window ')
    print()

x= [[3 4 5 6]
 [4 5 6 7]]
y= [[7]
 [8]]
x= [[1 2 3 4]
 [0 1 2 3]]
y= [[5]
 [4]]
x= [[5 6 7 8]
 [2 3 4 5]]
y= [[9]
 [6]]

[[2 3 4 5]
 [4 5 6 7]] end of window [[6]
 [8]] end of window 
[[3 4 5 6]
 [0 1 2 3]] end of window [[7]
 [4]] end of window 
[[5 6 7 8]
 [1 2 3 4]] end of window [[9]
 [5]] end of window 
