# TensorFlow Dataset

[TensorFlow Datasets](https://www.tensorflow.org/datasets/overview)

### tf.data.Dataset

The tf.data.Dataset API is the first and foremost API you should understand when using TensorFlow. When I started using TensorFlow, it was quite hard to understand what it is, so I just stuck to modeling and testing. However, if you don't understand tf.data.Dataset, you cannot create your own dataset for modeling or testing.

According to the official documentation, the tf.data.Dataset API provides the following three things:

1. Create a source dataset from your input data.
2. Apply dataset transformations to preprocess the data.
3. Iterate over the dataset and process the elements.


In [57]:
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
from tensorflow import keras

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from IPython.display import display, HTML
import requests
import csv

def import_mnist_dataset(log=False):
    """Load the Fashion-MNIST train and test splits through Keras.

    Note that, despite the function name, the data comes from
    ``keras.datasets.fashion_mnist``.

    Parameters
    ----------
    log : bool
        When true, print the type and shape of the images and labels of
        both splits and draw the first training image with matplotlib.

    Returns
    -------
    tuple
        ``(train_images, train_labels), (test_images, test_labels)`` as
        returned by ``fashion_mnist.load_data()``.
    """
    train_split, test_split = keras.datasets.fashion_mnist.load_data()
    train_images, train_labels = train_split
    test_images, test_labels = test_split

    if log:
        # Same report for both splits: training first, then test.
        for images, labels in ((train_images, train_labels),
                               (test_images, test_labels)):
            print("type of images  : ", type(images))
            print("shape of images : ", images.shape)
            print("type of label   : ", type(labels))
            print("shape of label  : ", labels.shape)

        print("sampel Image")
        first_training_image = train_images[0]
        plt.imshow(first_training_image)

    return (train_images, train_labels), (test_images, test_labels)


def download_iris_dataset(log=False):
    """Fetch the Iris training CSV and return the path of the local copy.

    The file is retrieved with ``tf.keras.utils.get_file()``; the local
    file name is the base name of the download URL.

    Parameters
    ----------
    log : bool
        When true, print the local path and display the head of the CSV
        loaded as a pandas DataFrame.

    Returns
    -------
    The path returned by ``tf.keras.utils.get_file()``.
    """
    source_url = "https://storage.googleapis.com/download.tensorflow.org/data/iris_training.csv"
    local_name = os.path.basename(source_url)
    local_path = tf.keras.utils.get_file(fname=local_name, origin=source_url)

    if not log:
        return local_path

    print("Local copy of the dataset file: {}".format(local_path))
    preview = pd.read_csv(local_path).head()
    display(preview)

    return local_path


def parse_iris_dataset(train_dataset_file_path, log=False, image_display=True,
                       batch_size=32):
    """Build a batched dataset from the Iris CSV file.

    The CSV is read with ``tf.data.experimental.make_csv_dataset()``
    (https://www.tensorflow.org/api_docs/python/tf/data/experimental/make_csv_dataset)
    using a fixed column layout; the last column, ``species``, is used as
    the label and the dataset covers the file once (``num_epochs=1``).

    Parameters
    ----------
    train_dataset_file_path
        Path of the CSV file, passed straight to ``make_csv_dataset()``.
    log : bool
        When true, print the feature names, the label name, the dataset
        type, and the ``petal_length`` values and labels of one batch.
    image_display : bool
        When true, show a scatter plot of petal length against sepal
        length for one batch, colored by label.
    batch_size : int
        Number of records per batch. Defaults to 32, the value that was
        previously hard-coded.

    Returns
    -------
    The dataset created by ``make_csv_dataset()``.
    """
    # Column order in the CSV file.
    column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

    feature_names = column_names[:-1]
    label_name = column_names[-1]

    train_dataset = tf.data.experimental.make_csv_dataset(
        train_dataset_file_path,
        batch_size,
        column_names=column_names,
        label_name=label_name,
        num_epochs=1)

    # A sample batch is only needed for plotting or logging; skip reading
    # it when neither was requested.
    if not (image_display or log):
        return train_dataset

    # Pull one batch from a fresh iterator. `features` is indexed by
    # column name below; `labels` holds the matching species values.
    features, labels = next(iter(train_dataset))

    # Display scatter plot of the data.
    if image_display:
        plt.scatter(features['petal_length'],
                    features['sepal_length'],
                    c=labels,
                    cmap='viridis')

        plt.xlabel("Petal length")
        plt.ylabel("Sepal length")
        plt.show()

    if log:
        print("Features: {}".format(feature_names))
        print("Label: {}".format(label_name))

        print("Type  : ", type(train_dataset))

        print("features : ", features["petal_length"])
        print("labels   : ", labels)

    return train_dataset


def execute():
    """Download the Iris CSV and parse it with logging turned on."""
    # download_iris_dataset() returns the local file path, which is the
    # first argument parse_iris_dataset() expects.
    parse_iris_dataset(download_iris_dataset(), log=True)



In [4]:

def import_mnist_dataset_info(log=False):
    """Load the MNIST training split together with its dataset info.

    Parameters
    ----------
    log : bool
        When true, print the loaded dataset, its type and the info object
        returned by ``tfds.load()``.

    Returns
    -------
    tuple
        ``(ds, info)`` as returned by ``tfds.load(..., with_info=True)``.
    """
    ds, info = tfds.load('mnist', split='train', shuffle_files=True, with_info=True)

    # `log` used to be accepted but ignored; honor it like the other loaders.
    if log:
        print("dataset : ", ds)
        print("Type    : ", type(ds))
        print("info    : ", info)

    return ds, info


def display_dataset(ds):
    """Print the keys, image shape and label of the first dataset element.

    Each element taken from ``ds`` is indexed with the ``"image"`` and
    ``"label"`` keys, so ``ds`` must yield dict-like elements that have
    both.
    """
    for example in ds.take(1):
        field_names = list(example.keys())
        print(field_names)
        image, label = example["image"], example["label"]
        print(image.shape, label)

        
def import_dataset_as_numpy():
    """Print the types and label of the first MNIST example via tfds.as_numpy().

    The dataset is loaded with ``as_supervised=True`` so that every element
    is a 2-tuple ``(input, label)``.
    """
    supervised_ds = tfds.load("mnist", split="train", as_supervised=True)
    first_example = supervised_ds.take(1)
    for image, label in tfds.as_numpy(first_example):
        print(type(image), type(label), label)


def visualize_dataset():
    """Show example images from the MNIST training split.

    The split and its info object are loaded with ``tfds.load()`` and
    passed to ``tfds.show_examples()``; see
    https://www.tensorflow.org/datasets/api_docs/python/tfds/visualization/show_examples
    """
    mnist_train, mnist_info = tfds.load("mnist", split="train", with_info=True)
    # The value returned by show_examples() is not used by this function.
    tfds.show_examples(mnist_train, mnist_info)
    
def cats_and_dog_dataset_load(log=False):
    """Load the cats_vs_dogs dataset as train / validation / test splits.

    The single ``train`` split is sliced 80% / 10% / 10% and loaded with
    ``as_supervised=True`` and ``with_info=True``.

    Parameters
    ----------
    log : bool
        When true, print the training dataset and its type.

    Returns
    -------
    tuple
        ``(raw_train, raw_validation, raw_test), metadata`` where
        ``metadata`` is the info object returned by ``tfds.load()``.

    Example
    -------
    (raw_train, raw_validation, raw_test), metadata = cats_and_dog_dataset_load(log=True)

    """
    split_spec = ['train[:80%]', 'train[80%:90%]', 'train[90%:]']

    # One dataset per entry of split_spec, plus the info object.
    splits, metadata = tfds.load(
        'cats_vs_dogs',
        split=split_spec,
        with_info=True,
        as_supervised=True,
    )
    raw_train, raw_validation, raw_test = splits

    if log:
        print("raw_train : ", raw_train)
        print("Type      : ", type(raw_train))

    return (raw_train, raw_validation, raw_test), metadata
    