# DATA PIPELINE

<b>Data format</b><br>
1 Row = 1 Example = 1 Cycle<br>
Each cycle so far has:
 - Qdlin (1000,1)
 - Tdlin (1000,1)
 - Cdlin (1000,1) (WIP)
 - discharge_time (1,) (WIP)
 - IR (1,)
 - remaining_cycle_life (1,) <- target

In [5]:
import pickle
from pathlib import Path

# only taking batch1 for testing
path1 = Path("Data/batch1.pkl")
# Open through a context manager so the file handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(path1, 'rb') as pkl_file:
    batch1 = pickle.load(pkl_file)

# remove batteries that do not reach 80% capacity
for excluded_cell in ('b1c8', 'b1c10', 'b1c12', 'b1c13', 'b1c22'):
    del batch1[excluded_cell]

# Writing to TFRecord

In [4]:
import os
import tensorflow as tf
from tensorflow.train import FloatList, Int64List
from tensorflow.train import Feature, Features, Example

In [3]:
"""
see Hands-On Machine Learning pp.416

1. The get_cycle_example function fetches all features and targets from
the batch1 file and converts them to "Example" objects. Every Example contains
data from one charging cycle.

2. Create a "Data/tfrecords" directory.

3. For each cell create a tfrecord file with the naming convention "b1c0.tfrecord".
The SerializeToString method creates binary data out of the Example objects that can
be read natively in TensorFlow.
"""

def get_cycle_example(cell, idx):
    """Pack one cycle of one cell into an Example object.

    Reads from the module-level ``batch1`` dict.

    Args:
        cell: key of the cell in ``batch1``.
        idx: cycle index; used directly to index the summary "IR" series and
            as ``str(idx)`` to look up the cycle's "Qdlin"/"Tdlin" data.

    Returns:
        An Example with the float features "IR", "Qdlin" and "Tdlin" and the
        int64 feature "Remaining_cycles" (``cycle_life - idx``).
    """
    cell_data = batch1[cell]

    def float_feature(values):
        # wrap a sequence of floats in a float_list Feature
        return Feature(float_list=FloatList(value=values))

    ir_feature = float_feature([cell_data["summary"]["IR"][idx]])

    cycle_data = cell_data["cycles"][str(idx)]
    qdlin_feature = float_feature(cycle_data["Qdlin"])
    tdlin_feature = float_feature(cycle_data["Tdlin"])

    remaining = int(cell_data["cycle_life"] - idx)
    remaining_feature = Feature(int64_list=Int64List(value=[remaining]))

    feature_map = {
        "IR": ir_feature,
        "Qdlin": qdlin_feature,
        "Tdlin": tdlin_feature,
        "Remaining_cycles": remaining_feature,
    }
    return Example(features=Features(feature=feature_map))


data_dir = "Data/tfrecords/"
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists (no separate exists-check needed).
os.makedirs(data_dir, exist_ok=True)

# write one TFRecord file per cell, one serialized Example per cycle
for cell in batch1:
    filename = os.path.join(data_dir, cell + ".tfrecord")
    with tf.io.TFRecordWriter(filename) as f:
        num_cycles = int(batch1[cell]["cycle_life"])-1
        for cycle in range(num_cycles):
            cycle_to_write = get_cycle_example(cell, cycle)
            f.write(cycle_to_write.SerializeToString())

NameError: name 'batch1' is not defined

# Reading from TFRecord

In [1]:
from tensorflow.feature_column import numeric_column, make_parse_example_spec

In [2]:
# if you haven't loaded the batch data yet and want to read from TFRecord files,
# run this cell instead to get an index and replace "batch1.keys()" with "batch1_keys" in the cell below
# Cell numbers 0..45 without 8, 10, 12, 13 and 22.
batch1_keys = [
    "b1c" + str(cell_number)
    for cell_number in range(46)
    if cell_number not in (8, 10, 12, 13, 22)
]

In [7]:
# Feature columns describing every field stored in the TFRecord files.
# Scalar fields use an empty shape; the two *dlin series hold 1000 values each.
ir = numeric_column(key="IR", shape=[])
qdlin = numeric_column(key="Qdlin", shape=[1000])
tdlin = numeric_column(key="Tdlin", shape=[1000])
# the target is stored as an int64 feature, so the dtype is set explicitly
rem_cycles = numeric_column(key="Remaining_cycles", shape=[], dtype=tf.int64)
columns = [
    ir,
    qdlin,
    tdlin,
    rem_cycles,
]

"""
Reading the remaining code from bottom to top:

When writing to TFrecord we created one file for each cell. Now we merge the
data back into one dataset and prepare it to be fed directly into a model.

The interleave() method will create a dataset that pulls cycle_length file paths (e.g. 4)
from the filepath_dataset and for each one calls the function "read_tfrecords". It will then
cycle through these datasets, reading one record at a time from each until all datasets
are out of items. Then it gets the next file paths from the filepath_dataset and
interleaves them the same way, and so on until it runs out of file paths.
Note: The call below currently uses cycle_length=1, so the files are read one after another.
Note: Even with parallel calls specified, data within batches is still sequential.

The read_tfrecords() function reads a file, skipping the first row which in our case
is 0/NaN most of the time. It then loops over each example/row in the dataset and
calls the parse_features function. Then it batches the dataset, so it always feeds
multiple examples at the same time, and then shuffles the batches (batching and
shuffling are currently commented out in the code). It is important
that we batch before shuffling, so the examples within the batches stay in order.

The parse_features function takes an example and converts it from binary/message format
into a more readable format. The make_parse_example_spec generates a feature mapping 
according to the columns we defined. To be able to feed the dataset directly into a
Tensorflow model later on, we need to split the data into examples and targets (i.e. X and y).
"""
# Used by flatten_windows: features are batched in groups of window_size and
# the first window_size-1 targets are skipped.
window_size = 5

def parse_features(example_proto):
    """Parse one serialized Example and split it into inputs and target.

    The parsing spec is generated from the module-level ``columns`` list.

    Args:
        example_proto: a serialized Example as read from a TFRecord file.

    Returns:
        A tuple ``(features, target)`` where ``features`` is the parsed dict
        without "Remaining_cycles" and ``target`` is that removed entry.
    """
    parsing_spec = make_parse_example_spec(columns)
    parsed = tf.io.parse_single_example(example_proto, parsing_spec)
    label = parsed.pop("Remaining_cycles")
    return parsed, label

def flatten_windows(features, target):
    """Pair a batch of ``window_size`` feature rows with a later target.

    Args:
        features: dataset of feature elements for one window.
        target: dataset of target elements for the same window.

    Returns:
        A zipped dataset of (features batched by ``window_size``, targets
        after skipping the first ``window_size - 1`` elements).
    """
    windowed_features = features.batch(window_size)
    trailing_targets = target.skip(window_size - 1)
    return tf.data.Dataset.zip((windowed_features, trailing_targets))

def read_tfrecords(file):
    """Read one TFRecord file and parse every record except the first.

    Args:
        file: path (or path tensor) of the TFRecord file to read.

    Returns:
        A dataset of ``(features, target)`` pairs produced by parse_features.
    """
    records = tf.data.TFRecordDataset(file)
    # skip can be removed when we have clean data
    dataset = records.skip(1).map(parse_features)
    # Windowing, shuffling and batching are disabled for now:
    #dataset = dataset.window(window_size, 1, 1, True).flat_map(flatten_windows)
    #dataset = dataset.shuffle(1000).batch(10).prefetch(1) # prefetch is only relevant for CPU to GPU pipelines, see Hands-On ML p.411
    return dataset

# Build one file path per cell and wrap the paths in a list_files dataset.
filepaths = [
    os.path.join("Data/tfrecords/", cell + ".tfrecord")
    for cell in batch1_keys
]
filepath_dataset = tf.data.Dataset.list_files(filepaths)

# Merge the per-file datasets produced by read_tfrecords into one dataset.
dataset = filepath_dataset.interleave(
    read_tfrecords,
    cycle_length=1,
    num_parallel_calls=1,
)

In [10]:
# show the targets of the first ten dataset elements
for _features, target in dataset.take(10):
    print(target)

tf.Tensor(755, shape=(), dtype=int64)
tf.Tensor(754, shape=(), dtype=int64)
tf.Tensor(753, shape=(), dtype=int64)
tf.Tensor(752, shape=(), dtype=int64)
tf.Tensor(751, shape=(), dtype=int64)
tf.Tensor(750, shape=(), dtype=int64)
tf.Tensor(749, shape=(), dtype=int64)
tf.Tensor(748, shape=(), dtype=int64)
tf.Tensor(747, shape=(), dtype=int64)
tf.Tensor(746, shape=(), dtype=int64)


# Everything below this line is experimental

# Feed dataset into model

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import DenseFeatures, Dense, Activation, LSTM
import numpy as np

In [70]:
"""
All we need is a DenseFeatures layer that specifies the input columns.
See Hands-On ML p.426

This does not work with an RNN layer instead of Dense(18) yet.
"""
# every column except the last one ("Remaining_cycles", the target) is an input
input_columns = columns[:-1]

model = Sequential()
model.add(DenseFeatures(feature_columns=input_columns))
model.add(Dense(18))
model.add(Activation("relu"))
# Linear output: the target is a cycle count in the hundreds, so a sigmoid
# (bounded to (0, 1)) could never fit it with an MSE loss.
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

In [72]:
# Train for 10 epochs; `dataset` is expected to yield (features, target) pairs
# as returned by parse_features.
model.fit(dataset, epochs=10)

Epoch 1/10
    247/Unknown - 2s 8ms/step - loss: 264732.4879

KeyboardInterrupt: 

# NOTES AND ALTERNATIVES

# Two more ways to read files

In [None]:
cell = "b1c0"
raw_dataset = tf.data.TFRecordDataset(["Data/tfrecords/" + cell + ".tfrecord"])
# skip(1) is important to keep the first record of each cell out, which is all Nan/0
raw_dataset = raw_dataset.skip(1)

In [None]:
# VERSION 1
# reading manually

# Hand-written parsing spec: one FixedLenFeature(shape, dtype) per stored field.
feature_description = {
    "IR": tf.io.FixedLenFeature([], tf.float32),
    "Qdlin": tf.io.FixedLenFeature([1000], tf.float32),
    "Tdlin": tf.io.FixedLenFeature([1000], tf.float32),
    "Remaining_cycles": tf.io.FixedLenFeature([], tf.int64),
}


def _parse_features(example_proto):
    """Parse one serialized Example with ``feature_description``.

    Returns a ``(features, target)`` tuple; the target is the
    "Remaining_cycles" entry removed from the parsed dict.
    """
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    label = parsed.pop("Remaining_cycles")
    return parsed, label


# parse every record, then batch before shuffling
dataset = raw_dataset.map(_parse_features)
dataset = dataset.batch(5).shuffle(1000)

In [None]:
# show the targets of the first ten dataset elements
for _example, target in dataset.take(10):
    print(target)

In [None]:
# VERSION 2
# reading with feature columns

# Feature columns for every stored field; the parsing spec is derived from them.
ir = numeric_column(key="IR", shape=[])
qdlin = numeric_column(key="Qdlin", shape=[1000])
tdlin = numeric_column(key="Tdlin", shape=[1000])
rem_cycles = numeric_column(key="Remaining_cycles", shape=[], dtype=tf.int64)
columns = [
    ir,
    qdlin,
    tdlin,
    rem_cycles,
]


def _parse_features(example_proto):
    """Parse one serialized Example with a spec generated from ``columns``.

    Returns a ``(features, target)`` tuple; the target is the
    "Remaining_cycles" entry removed from the parsed dict.
    """
    parsing_spec = make_parse_example_spec(columns)
    parsed = tf.io.parse_single_example(example_proto, parsing_spec)
    label = parsed.pop("Remaining_cycles")
    return parsed, label


# parse, batch, shuffle the batches, then repeat the whole dataset 5 times
dataset = (
    raw_dataset
    .map(_parse_features)
    .batch(5)
    .shuffle(1000)
    .repeat(5)
)

In [None]:
# show the targets of the first ten dataset elements
for _examples, target in dataset.take(10):
    print(target)

# Writing example with FeatureList

Summarizing the Xdlin-features with FeatureList might be helpful if we wanted to keep information about the sequence data on the detail-level.

In [None]:
import tensorflow as tf
from tensorflow.train import FloatList
from tensorflow.train import Feature, Features, FeatureList, FeatureLists, SequenceExample

In [None]:
# Write
cell = batch1["b1c0"]
# Bind the detail data of the cycle whose IR value is used below; cycle keys
# are strings (see get_cycle_example). Without this, `cycle` is not a cycle dict.
cycle = cell["cycles"]["1"]

# context feature: a single IR value for the cycle
ir = Feature(float_list=FloatList(value=[cell["summary"]["IR"][1]]))
# sequence features: the Qdlin and Tdlin series of the same cycle
qdlin = Feature(float_list=FloatList(value=cycle["Qdlin"]))
tdlin = Feature(float_list=FloatList(value=cycle["Tdlin"]))

detail_features = FeatureList(feature=[qdlin, tdlin])

cycle_example = SequenceExample(
    context = Features(feature={"IR":ir}),
    feature_lists = FeatureLists(feature_list={"Details":detail_features})
)

with tf.io.TFRecordWriter("my_aligned_cycle.tfrecord") as f:
    f.write(cycle_example.SerializeToString())

In [None]:
# Read
# spec for the context part of the SequenceExample (the single IR value)
context_feature_description = {
    "IR": tf.io.FixedLenFeature(shape=[], dtype=tf.float32, default_value=0),
}

# spec for the feature list "Details": a sequence of 1000-value rows
sequence_feature_description = {
    "Details": tf.io.FixedLenSequenceFeature(shape=[1000], dtype=tf.float32),
}

for serialized_example in tf.data.TFRecordDataset(["my_aligned_cycle.tfrecord"]):
    # returns a (context, sequence) pair of parsed dicts
    parsed_example = tf.io.parse_single_sequence_example(
        serialized_example,
        context_features=context_feature_description,
        sequence_features=sequence_feature_description,
    )
    # print the first row of the "Details" feature list
    print(parsed_example[1]["Details"][0])