In [35]:
import numpy as np
import pandas as pd
import tensorflow as tf
from   tensorflow.python.keras.datasets import fashion_mnist

tf.executing_eagerly()

True

### Numpy Arrays with Datasets

In [3]:
n_items = 11
n_list1 = np.arange(n_items)
n_list2 = np.arange(n_items, n_items*2)
n_list1, n_list2

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]))

In [4]:
n_list1_dataset = tf.data.Dataset.from_tensor_slices(n_list1)
n_list1_dataset

<TensorSliceDataset shapes: (), types: tf.int64>

In [5]:
iterator = tf.compat.v1.data.make_one_shot_iterator(n_list1_dataset)
iterator

<tensorflow.python.data.ops.iterator_ops.IteratorV2 at 0x1375fe150>

In [6]:
for item in n_list1_dataset:
    n = iterator.get_next().numpy()
    print(n)

0
1
2
3
4
5
6
7
8
9
10


In [7]:
n_list1_dataset = tf.data.Dataset\
    .from_tensor_slices(n_list1)\
    .batch(3, drop_remainder=False)
iterator = tf.compat.v1.data.make_one_shot_iterator(n_list1_dataset)
for i in n_list1_dataset:
    n = iterator.get_next().numpy()
    print(n)

[0 1 2]
[3 4 5]
[6 7 8]
[ 9 10]


In [8]:
ds1 = [1, 2, 3, 4, 5]
ds2 = ['a', 'e', 'i', 'o', 'u']
ds1 = tf.data.Dataset.from_tensor_slices(ds1)
ds2 = tf.data.Dataset.from_tensor_slices(ds2)
zipped_ds = tf.data.Dataset.zip((ds1, ds2))
iterator = tf.compat.v1.data.make_one_shot_iterator(zipped_ds)
for i in zipped_ds:
    n = iterator.get_next()
    print(n)

(<tf.Tensor: id=75, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=76, shape=(), dtype=string, numpy=b'a'>)
(<tf.Tensor: id=79, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=80, shape=(), dtype=string, numpy=b'e'>)
(<tf.Tensor: id=83, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=84, shape=(), dtype=string, numpy=b'i'>)
(<tf.Tensor: id=87, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=88, shape=(), dtype=string, numpy=b'o'>)
(<tf.Tensor: id=91, shape=(), dtype=int32, numpy=5>, <tf.Tensor: id=92, shape=(), dtype=string, numpy=b'u'>)


In [9]:
ds1 = tf.data.Dataset.from_tensor_slices([1, 2, 3, 5])
ds2 = tf.data.Dataset.from_tensor_slices([19, 23, 29])
ds3 = ds1.concatenate(ds2)
print(ds3)

<ConcatenateDataset shapes: (), types: tf.int32>


In [10]:
iterator = tf.compat.v1.data.make_one_shot_iterator(ds3)
for i in range(7):
    n = iterator.get_next()
    print(n)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(19, shape=(), dtype=int32)
tf.Tensor(23, shape=(), dtype=int32)
tf.Tensor(29, shape=(), dtype=int32)


In [11]:
epochs = 2
for e in range(epochs):
    for i in ds3:
        print(i)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(19, shape=(), dtype=int32)
tf.Tensor(23, shape=(), dtype=int32)
tf.Tensor(29, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(19, shape=(), dtype=int32)
tf.Tensor(23, shape=(), dtype=int32)
tf.Tensor(29, shape=(), dtype=int32)


### CSV

In [12]:
# make a test file
df = pd.DataFrame({'a': [3.14159, 2.171, 0.12345], 
                   'b': [0.1, 1.2, 3.5],
                   'c': [1.2, 4.8, 16.32],
                   'd': ['dogs', 'dirt', 'daliances']})
df

Unnamed: 0,a,b,c,d
0,3.14159,0.1,1.2,dogs
1,2.171,1.2,4.8,dirt
2,0.12345,3.5,16.32,daliances


In [13]:
df.to_csv('test.csv', index=False)

In [14]:
filename = ['./test.csv']
record_defaults = [tf.float32] * 2
dataset = tf.data.experimental.CsvDataset(
    filename, record_defaults, header=True, select_cols=[0, 2]) # a&c only
for i in dataset:
    print(i)

(<tf.Tensor: id=150, shape=(), dtype=float32, numpy=3.14159>, <tf.Tensor: id=151, shape=(), dtype=float32, numpy=1.2>)
(<tf.Tensor: id=152, shape=(), dtype=float32, numpy=2.171>, <tf.Tensor: id=153, shape=(), dtype=float32, numpy=4.8>)
(<tf.Tensor: id=154, shape=(), dtype=float32, numpy=0.12345>, <tf.Tensor: id=155, shape=(), dtype=float32, numpy=16.32>)


In [15]:
record_defaults += [tf.string]
dataset = tf.data.experimental.CsvDataset(
    filename, record_defaults, header=True, select_cols=[0, 2, 3])
for item in dataset:
    print(item[0].numpy(), item[1].numpy(), item[2].numpy().decode())

3.14159 1.2 dogs
2.171 4.8 dirt
0.12345 16.32 daliances


### TFRecords (binaries)

In [16]:
data = np.array([0., 1., 2., 3., 4., 5.])

In [17]:
def np_to_tfrecord(fname, data):
    writer = tf.io.TFRecordWriter(fname)
    feature = {}
    feature['data'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=data))
    example = tf.train.Example(
        features=tf.train.Features(feature=feature))
    serialized = example.SerializeToString()
    writer.write(serialized)
    writer.close()

In [18]:
np_to_tfrecord('./trf_test.tfrecords', data)

In [19]:
!cat trf_test.tfrecords

(       �pJ
&
$
data
      �?   @  @@  �@  �@ݖ�!

In [20]:
dataset = tf.data.TFRecordDataset('./trf_test.tfrecords')

In [21]:
def parse_tfr(proto):
    keys_to_features = {
        'data': tf.io.FixedLenSequenceFeature(
            [], dtype=tf.float32, allow_missing=True)}
    parsed = tf.io.parse_single_example(serialized=proto, 
                                        features=keys_to_features)
    return parsed['data']

In [22]:
dataset = dataset.map(parse_tfr)
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
# array retrieved as one item
item = iterator.get_next()
print(item)
print(item.numpy())
print(item[2].numpy())

tf.Tensor([0. 1. 2. 3. 4. 5.], shape=(6,), dtype=float32)
[0. 1. 2. 3. 4. 5.]
2.0


In [23]:
filename = './trf_test2.tfrecords'
data = {'ID': 12345,
        'Name': ['Bob', 'Dobolina'],
        'Scores': [0., 0., 27.7]}

In [24]:
ID = tf.train.Feature(int64_list=tf.train.Int64List(value=[data['ID']]))
Name = tf.train.Feature(
    bytes_list=tf.train.BytesList(
        value=[n.encode('utf-8') for n in data['Name']]))
Scores = tf.train.Feature(
    float_list=tf.train.FloatList(value=data['Scores']))
example = tf.train.Example(
    features=tf.train.Features(
        feature={'ID': ID, 'Name': Name, 'Scores': Scores}))

In [25]:
writer = tf.io.TFRecordWriter(filename)
writer.write(example.SerializeToString())
writer.close()

In [26]:
dataset = tf.data.TFRecordDataset(filename)

In [27]:
def parse_tfr2(proto):
    keys_to_features = {'ID': tf.io.FixedLenFeature([], dtype=tf.int64),
                        'Name': tf.io.VarLenFeature(dtype=tf.string),
                        'Scores': tf.io.VarLenFeature(dtype=tf.float32)}
    parsed = tf.io.parse_single_example(serialized=proto, 
                                        features=keys_to_features)
    return parsed['ID'], parsed['Name'], parsed['Scores']

In [28]:
dataset.map(parse_tfr2)
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
item = iterator.get_next()
# record retrieved as single item
print(item)

tf.Tensor(b'\nE\n\x0c\n\x02ID\x12\x06\x1a\x04\n\x02\xb9`\n\x1a\n\x06Scores\x12\x10\x12\x0e\n\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\xddA\n\x19\n\x04Name\x12\x11\n\x0f\n\x03Bob\n\x08Dobolina', shape=(), dtype=string)


In [31]:
# ..huh... not quite right...
#print('ID:', item[0].numpy())

### One-Hot Encoding

In [32]:
y = 5
y_1hot = tf.one_hot(y, depth=10).numpy()
y_1hot

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32)

In [36]:
width, height = 28, 28
n_classes = 10

In [38]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
split = 50000
(y_train, y_valid) = y_train[:split], y_train[split:]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [39]:
y_train_1h = tf.one_hot(y_train, depth=n_classes).numpy()
y_valid_1h = tf.one_hot(y_valid, depth=n_classes).numpy()
y_test_1h  = tf.one_hot(y_test,  depth=n_classes).numpy()

i = 5
y_train[i], y_train_1h[i]

(2, array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.], dtype=float32))