In [4]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import glob 

In [14]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset = dataset.repeat(3)
dataset = dataset.map(lambda x: x * 2)
dataset = dataset.shuffle(buffer_size = 5).batch(7, drop_remainder = True)
for item in dataset:
    print(item)

tf.Tensor([ 6  0 10  2  4  8 12], shape=(7,), dtype=int32)
tf.Tensor([16  0  4  6 10 14  2], shape=(7,), dtype=int32)
tf.Tensor([18 18 12  8  0 14  4], shape=(7,), dtype=int32)
tf.Tensor([ 8  6 12 10 16 18  2], shape=(7,), dtype=int32)


In [90]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [89]:
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers,
    num_parallel_calls=tf.data.experimental.AUTOTUNE
)

In [101]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [102]:
n_inputs = 8
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype = tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    print(x)
    return (x-X_mean)/X_std,y

preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')


tf.Tensor(
[   4.2083   44.        5.3232    0.9171  846.        2.337    37.47
 -122.2   ], shape=(8,), dtype=float32)


(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

In [103]:
def csv_reader_dataset(filepaths, repeat = 1, n_readers = 5, n_read_threads = None, shuffle_buffer_size = 10000, n_parse_threads =5,batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(
        lambda filepath : tf.data.TextLineDataset(filepath).skip(1),
        cycle_length = n_readers, num_parallel_calls = n_read_threads
    )
    dataset = dataset.map(preprocess, num_parallel_calls = n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)

In [110]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

Tensor("stack:0", shape=(8,), dtype=float32)
Tensor("stack:0", shape=(8,), dtype=float32)
Tensor("stack:0", shape=(8,), dtype=float32)


In [111]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])
model.compile(loss="mse", optimizer = keras.optimizers.SGD(learning_rate=1e-3))
model.fit(train_set, steps_per_epoch=len(X_train)//32, epochs= 10, validation_data= valid_set)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x259c6d0c520>

In [113]:
model.evaluate(test_set)
new_set = test_set.take(3)
model.predict(new_set)



array([[1.9837748],
       [1.2196411],
       [1.2460581],
       [2.6631696],
       [2.5247085],
       [2.1588447],
       [2.4803543],
       [1.7593637],
       [1.5929253],
       [6.3101254],
       [3.273668 ],
       [1.482578 ],
       [1.7871768],
       [0.6937831],
       [2.0335884],
       [4.1171155],
       [1.411269 ],
       [1.2546948],
       [2.7395356],
       [3.3186095],
       [1.004983 ],
       [1.1807188],
       [2.062243 ],
       [2.7237601],
       [2.729326 ],
       [1.0754367],
       [1.6993295],
       [2.0443444],
       [1.6621448],
       [1.4986268],
       [1.8680217],
       [2.7993684],
       [0.9799485],
       [1.5730546],
       [2.8173032],
       [2.634295 ],
       [2.6624436],
       [2.2537723],
       [2.5078762],
       [1.3064256],
       [2.443819 ],
       [1.5686886],
       [3.9725018],
       [2.2104964],
       [2.1460476],
       [2.0177686],
       [1.2622101],
       [2.7350864],
       [1.0128222],
       [1.352803 ],


In [115]:
with tf.io.TFRecordWriter('my_data.tfrecord')as f:
    f.write(b'This is the first record')
    f.write(b'And this is the second record')

In [121]:
filepaths = ['my_data.tfrecord']
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


<TFRecordDatasetV2 shapes: (), types: tf.string>

In [123]:
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('my_compressed.tfrecord', options) as f:
    f.write(b'This is the first record')
    f.write(b'And this is the second record')

In [124]:
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"], compression_type="GZIP")
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


In [16]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

person_example = Example(
    features = Features(
        feature = {
            "name":Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id":Feature(int64_list=Int64List(value=[123])),
            "email":Feature(bytes_list=BytesList(value=[b"a@b.com",b"c@d.com"]))
        }
    )
)

with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    f.write(person_example.SerializeToString())

In [131]:
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([],tf.int64, default_value=0),
    "email": tf.io.VarLenFeature(tf.string),
}
for serialized_example in tf.data.TFRecordDataset(['my_contacts.tfrecord']):
    parsed_example = tf.io.parse_single_example(serialized_example,feature_description)
    print(parsed_example["email"].values)

tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string)


In [17]:
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        self.means_ = np.mean(data_sample, axis = 0, keepdims=True)
        self.std_ = np.std(data_sample, axis = 0, keepdims=True)
    def call(self, inputs):
        return (inputs - self.means_)/(self.std_ + keras.backend.epsilon())

In [154]:
std_layer = Standardization()
std_layer.adapt(X_train)
model = keras.Sequential([
    std_layer,
    keras.layers.Dense(30, input_shape = X_train.shape[1:], activation = "relu"),
    keras.layers.Dense(1)
])
model.compile(loss = "mse", optimizer = keras.optimizers.SGD(learning_rate=1e-3))
model.fit(x=X_train,y=y_train,epochs=10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x259c784e520>

In [3]:
import tensorflow_datasets as tfds

dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = dataset['train'].repeat().prefetch(1)
model = keras.Sequential([
    keras.layers.Flatten(input_shape = [28,28,1]),
    keras.layers.Dense(30, activation = 'relu'),
    keras.layers.Dense(10, activation = 'softmax')
])
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = keras.optimizers.SGD(learning_rate=1e-3))
model.fit(mnist_train,steps_per_epoch=60000//32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ec5e9344f0>

### Exercises


1. Speed and abstraction
2. file size, interleaving, memory
3. Preprocessing takes longer than it should, preprocess data before training
4. Any binary data
5. You can use already predefined structures
6. 
7. 
8. One hot encoding and textvectorization, bag of words TF-IDF

In [41]:
from pathlib import Path

DOWNLOAD_ROOT = "http://ai.stanford.edu/~amaas/data/sentiment/"
FILENAME = "aclImdb_v1.tar.gz"
filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)
path = Path(filepath).parent / "aclImdb"
path

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


WindowsPath('C:/Users/carlos/.keras/datasets/aclImdb')

In [42]:
def review_paths(dirpath):
    return [str(path) for path in dirpath.glob("*.txt")]

train_pos = review_paths(path / "train" / "pos")
train_neg = review_paths(path / "train" / "neg")
test_valid_pos = review_paths(path / "test" / "pos")
test_valid_neg = review_paths(path / "test" / "neg")

len(train_pos), len(train_neg), len(test_valid_pos), len(test_valid_neg)

(12500, 12500, 12500, 12500)

In [43]:
train_pos = review_paths(path / "train" / "pos")
train_neg = review_paths(path / "train" / "neg")
test_valid_pos = review_paths(path / "test" / "pos")
test_valid_neg = review_paths(path / "test" / "neg")

len(train_pos), len(train_neg), len(test_valid_pos), len(test_valid_neg)

(12500, 12500, 12500, 12500)

In [44]:
np.random.shuffle(test_valid_pos)

test_pos = test_valid_pos[:5000]
test_neg = test_valid_neg[:5000]
valid_pos = test_valid_pos[5000:]
valid_neg = test_valid_neg[5000:]

In [47]:
def imdb_dataset(filepaths_positive, filepaths_negative):
    reviews = []
    labels = []
    for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):
        for filepath in filepaths:
            with open(filepath, encoding='utf-8') as review_file:
                reviews.append(review_file.read())
            labels.append(label)
    return tf.data.Dataset.from_tensor_slices(
        (tf.constant(reviews), tf.constant(labels)))

In [48]:
for X, y in imdb_dataset(train_pos, train_neg).take(3):
    print(X)
    print(y)
    print()

tf.Tensor(b"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)

tf.Tensor(b"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public a