# [$tf.data.Dataset$](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)

Represents a potentially large set of elements.

In [None]:
import os
import tensorflow as tf

### **Create a $tf.data.Dataset$ from a given list of elements.**

In [None]:
# Small in-memory list of integers used as the demo input for the cells below.
elements = [21, 22, -108, 31, -1, 32, 34, 31]
# Build a dataset from the list; later cells iterate over it value by value.
tf_dataset = tf.data.Dataset.from_tensor_slices(elements)
# Print the dataset object itself (its values are printed in the following cells).
print(tf_dataset)

### **Iterate through $tf.data.Dataset$.**

In [None]:
# Iterate the dataset directly; each element is converted with .numpy()
# before it is printed.
for element in tf_dataset:
    value = element.numpy()
    print(value)

### **Iterate through all elements as NumPy elements.**

In [None]:
# as_numpy_iterator() hands back values that can be printed as-is,
# so no per-element .numpy() call is needed here.
numpy_values = tf_dataset.as_numpy_iterator()
for value in numpy_values:
    print(value)

### **Iterate through the first "$n$" elements in the $tf.data.Dataset$.**

In [None]:
# take(3) restricts the iteration to the first three elements.
first_three = tf_dataset.take(3)
for element in first_three:
    print(element.numpy())

### **Keep only the elements that are greater than 0.**

In [None]:
def _is_positive(x):
    """Return whether the element x is strictly greater than zero."""
    return x > 0


# Rebind tf_dataset to the filtered dataset so that the following cells
# only see the positive values.
tf_dataset = tf_dataset.filter(_is_positive)
for value in tf_dataset.as_numpy_iterator():
    print(value)

### **Multiply each element by 10.**

In [None]:
# Multiply every element by 10. tf_dataset is rebound to the result, so
# re-running this cell multiplies the values again.
tf_dataset = tf_dataset.map(lambda value: value * 10)
for scaled_value in tf_dataset.as_numpy_iterator():
    print(scaled_value)

### **Shuffle elements in the $tf.data.Dataset$.**

In [None]:
# Shuffle with a buffer of two elements and rebind tf_dataset to the
# shuffled dataset.
tf_dataset = tf_dataset.shuffle(buffer_size=2)
for value in tf_dataset.as_numpy_iterator():
    print(value)

### **Batching elements in the $tf.data.Dataset$.**

In [None]:
# Group the elements into batches of two and print each batch as a NumPy array.
batched = tf_dataset.batch(2)
for batch in batched:
    print(batch.numpy())

### **Perform all of the above operations in one shot.**

In [None]:
# Rebuild the dataset from the raw list and apply filter -> map -> shuffle ->
# batch as a single chained expression, one transformation per line.
tf_dataset = (
    tf.data.Dataset.from_tensor_slices(elements)
    .filter(lambda value: value > 0)
    .map(lambda value: value * 10)
    .shuffle(buffer_size=2)
    .batch(batch_size=2)
)

for batch in tf_dataset.as_numpy_iterator():
    print(batch)

# **Load CSV Dataset.**

In [None]:
import pandas as pd

# Location of the abalone training data and the labels to assign to its
# columns (passed to read_csv through names=).
ABALONE_URL = (
    "https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv"
)
ABALONE_COLUMNS = [
    "Length",
    "Diameter",
    "Height",
    "Whole Weight",
    "Shucked Weight",
    "Viscera Weight",
    "Shell Weight",
    "Age",
]

data = pd.read_csv(ABALONE_URL, names=ABALONE_COLUMNS)

# Show the first rows as the cell output.
data.head()

In [None]:
# Save into .CSV File.
# index=False keeps the automatically generated row index out of the file;
# otherwise it is written as an extra unnamed first column.
data.to_csv("abalone_train.csv", index=False)

### **BLOG:** [**Stop using CSVs for Storage. Pickle is an 80 times faster alternative.**](https://towardsdatascience.com/stop-using-csvs-for-storage-pickle-is-an-80-times-faster-alternative-832041bbc199)

In [None]:
# Write the DataFrame to a pickle file using the DataFrame method form.
data.to_pickle("train.pkl")

# Load the same file back into `data` and show the first rows to confirm
# the round trip.
data = pd.read_pickle("train.pkl")
data.head()

In [None]:
import pickle

# Dump Pickle File.
# Pickle the DataFrame itself. Passing the CSV file name here would only
# store that string, and `df` below would come back as a str.
with open("abalone_train.pkl", "wb") as f:
    pickle.dump(data, f)

# Load Pickle File.
# NOTE: pickle.load executes arbitrary code from the file; only load pickle
# files that were produced locally or come from a trusted source.
with open("abalone_train.pkl", "rb") as f:
    df = pickle.load(f)

# **Handling Images.**

> [**Kaggle Dataset**](https://www.kaggle.com/datasets/shaunthesheep/microsoft-catsvsdogs-dataset)

In [None]:
# Install Kaggle.
# --force-reinstall together with --no-deps reinstalls only the kaggle package
# itself and leaves its already-installed dependencies untouched.
!pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# Files Upload.
# Colab-only upload helper; presumably used to upload kaggle.json, which the
# next cell copies into ~/.kaggle.
from google.colab import files

files.upload()

In [None]:
# Create a Kaggle Folder.
!mkdir ~/.kaggle

# Copy the kaggle.json to the folder created.
!cp kaggle.json ~/.kaggle/

# Permission for the json file to act.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Dataset Download.
# Uses the Kaggle CLI with the credentials placed in ~/.kaggle/kaggle.json;
# the next cell unzips the resulting microsoft-catsvsdogs-dataset.zip.
!kaggle datasets download -d shaunthesheep/microsoft-catsvsdogs-dataset

In [None]:
# Unzip Dataset.
!unzip microsoft-catsvsdogs-dataset.zip

### **Load Images using $tf.data.Dataset$.**

In [None]:
# Load Images to tf.data.Dataset.
# Match only .jpg files: a bare "*/*" glob also picks up any non-image files
# in the class folders (e.g. Windows Thumbs.db), which would later fail in
# the JPEG decoder. shuffle=False keeps the listing order deterministic.
images_ds = tf.data.Dataset.list_files("PetImages/*/*.jpg", shuffle=False)

In [None]:
# Image Count.
# Number of file paths in images_ds; reused below to size the train/test split.
image_count = len(images_ds)
print(image_count)

In [None]:
# Shuffle and Iterate through the first "n" elements in the tf.data.Dataset.
# The listing is ordered by class folder, so a small buffer would leave the
# later take()/skip() split heavily skewed towards one class. A buffer covering
# every path gives a uniform shuffle (paths are small strings, so this is cheap).
# reshuffle_each_iteration=False fixes the order once, which keeps the train
# and test splits disjoint and stable across iterations.
images_ds = images_ds.shuffle(image_count, reshuffle_each_iteration=False)
for file in images_ds.take(3):
    print(file.numpy())

In [None]:
# Names of the two class folders under PetImages/ (get_label returns the
# folder name as the label). Not referenced by the other cells shown here.
class_names = ["Cat", "Dog"]

In [None]:
# Split Dataset into Training and Test Set.
# The first 80% of the paths become the training set; everything after
# that point becomes the test set.
train_fraction = 0.8
train_size = int(image_count * train_fraction)

train_ds, test_ds = images_ds.take(train_size), images_ds.skip(train_size)

print(len(train_ds), len(test_ds))

In [None]:
def get_label(file_path):
    """Return the name of the directory that directly contains file_path.

    The path is split on os.path.sep and the second-to-last component
    (the parent folder) is returned.
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    parent_dir = path_components[-2]
    return parent_dir


def process_image(file_path):
    """Load one image file and pair it with its label.

    Args:
        file_path: Path to a JPEG file; its parent folder name is the label.

    Returns:
        A tuple (img, label): the decoded image resized to 128x128, and the
        label returned by get_label(file_path).
    """
    label = get_label(file_path)
    raw_bytes = tf.io.read_file(file_path)
    # Force three channels so greyscale or CMYK JPEGs decode to the same
    # shape as RGB ones; mixed channel counts break batching downstream.
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    img = tf.image.resize(img, [128, 128])
    return img, label

In [None]:
# Spot check on one path: with "/" as the separator, the second-to-last
# path component here is "Dog".
get_label("/content/PetImages/Dog/10037.jpg")

In [None]:
# Run the per-file processing on a single image and display the first two
# rows of the resulting array as the cell output.
img, label = process_image("/content/PetImages/Dog/10037.jpg")
img.numpy()[:2]

In [None]:
# Turn every file path into an (image, label) pair in both splits.
train_ds, test_ds = train_ds.map(process_image), test_ds.map(process_image)

In [None]:
# Peek at a single processed training example.
for sample_image, sample_label in train_ds.take(1):
    print("****", sample_image)
    print("****", sample_label)

In [None]:
# Normalize the Image Values.
def scale(image, label):
    """Divide image by 255 and return it together with the unchanged label."""
    normalized = image / 255
    return normalized, label


# Normalize both splits: scaling only the training data would leave the test
# images in a different value range from what training saw.
train_ds = train_ds.map(scale)
test_ds = test_ds.map(scale)

for image, label in train_ds.take(5):
    # image.numpy()[0][0] is the first pixel of the first row.
    print("****Image: ", image.numpy()[0][0])
    print("****Label: ", label.numpy())

<h3  align="center" style="color:blue"><b>TF Data Input Pipeline: Exercise Solution</b></h3>

Movie reviews are present as individual text files (one file per review) in the reviews folder.

The folder structure looks like this:

**[reviews](https://github.com/codebasics/deep-learning-keras-tf-tutorial/tree/master/44_tf_data_pipeline/Exercise/reviews)**

    |__ positive
        |__pos_1.txt
        |__pos_2.txt
        |__pos_3.txt

    |__ negative
        |__neg_1.txt
        |__neg_2.txt
        |__neg_3.txt


We need to read these reviews using $tf.data.Dataset$ and perform the following transformations.

1.    Read each text review and generate a label from its folder name. The dataset should hold each review text and its label as a tuple.

2.   Filter out blank text reviews. Two files are blank in this dataset.

3.   Do all of the above transformations in a single line of code. Also, shuffle all the reviews.

In [None]:
# Retrieve and View review file paths in a TensorFlow Dataset.
# The glob matches every file one level below each sub-folder of reviews/;
# shuffle=False asks list_files not to shuffle the listing.
reviews_ds = tf.data.Dataset.list_files("reviews/*/*", shuffle=False)

In [None]:
# Print every review file path held by the dataset.
for review_path in reviews_ds:
    path_bytes = review_path.numpy()
    print(path_bytes)

In [None]:
def extract_review_and_label(file_path):
    """Return (review text, label) for a single review file.

    The text is the file's content as read by tf.io.read_file; the label is
    the name of the folder that directly contains the file.
    """
    review_text = tf.io.read_file(file_path)
    label = tf.strings.split(file_path, os.path.sep)[-2]
    return review_text, label

### **Extract review text from these files. Extract the label from the folder name.**

In [None]:
# Turn each file path into a (review text, label) pair.
reviews_ds_1 = reviews_ds.map(extract_review_and_label)

# Print the first 50 bytes of every review next to its label.
for text, folder in reviews_ds_1:
    preview = text.numpy()[:50]
    print("Review: ", preview)
    print("Label: ", folder.numpy())

### **Filter Blank Reviews.**

In [None]:
def _has_text(review, label):
    """Return whether the review text is not the empty string."""
    return review != ""


# Drop the pairs whose review text is empty.
reviews_ds_2 = reviews_ds_1.filter(_has_text)

for text, folder in reviews_ds_2.as_numpy_iterator():
    print("Review: ", text[:50])
    print("Label: ", folder)

### **Perform $map()$, $filter()$, and $shuffle()$ all in a single line of code.**

In [None]:
# Read, label, drop blank reviews and shuffle, all in one chained expression.
# The shuffle buffer is sized to the number of review files so that every
# review can land in any position; a buffer of 2 would leave the order almost
# unchanged.
final_ds = (
    reviews_ds.map(extract_review_and_label)
    .filter(lambda review, label: review != "")
    .shuffle(len(reviews_ds))
)

for review, label in final_ds.as_numpy_iterator():
    print("Review:", review[:50])
    print("Label:", label)

## [**Tensorflow Input Pipeline | TF Dataset**](https://www.youtube.com/watch?v=VFEOskzhhbc&list=PLeo1K3hjS3uu7CxAacxVndI4bE_o3BDtO&index=44)