In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **tf.data API**

The `tf.data` API lets us build input pipelines for data of any type.

We are going to use the `Dataset` class for handling all kinds of data.

We can apply all kinds of preprocessing to a `Dataset` object.

In [3]:
# There are different ways of creating a dataset. Here we use the from_tensor_slices function.

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8])
print(dataset)

<TensorSliceDataset shapes: (), types: tf.int32>


In [4]:
# There are different ways of creating a dataset. Here we use from_tensor_slices again, this time on a nested list, so each element is a row of length 2.

dataset = tf.data.Dataset.from_tensor_slices([[1, 2],[3, 4], [5, 6], [7, 8]])
print(dataset)

<TensorSliceDataset shapes: (2,), types: tf.int32>


In [5]:
# We can access each element inside the dataset object, because the object is iterable.

for elem in dataset:
  print(elem)

tf.Tensor([1 2], shape=(2,), dtype=int32)
tf.Tensor([3 4], shape=(2,), dtype=int32)
tf.Tensor([5 6], shape=(2,), dtype=int32)
tf.Tensor([7 8], shape=(2,), dtype=int32)


In [6]:
# The dataset object is iterable; calling .numpy() on an element converts it from a tensor to a NumPy array.

for elem in dataset:
  print(elem.numpy())

[1 2]
[3 4]
[5 6]
[7 8]


In [7]:
# from_tensor_slices slices its input along the first axis, so the first
# dimension becomes the number of elements and the remaining dimensions
# describe a single element.
# A (128, 5) tensor of uniform random values therefore yields 128 elements,
# each a vector of length 5.
uniform_samples = tf.random.uniform([128, 5])  # 128 rows, 5 columns
dataset = tf.data.Dataset.from_tensor_slices(uniform_samples)
print(dataset.element_spec)

# The printed spec shows that every element has shape (5,) and dtype float32.

TensorSpec(shape=(5,), dtype=tf.float32, name=None)


In [8]:
# from_tensor_slices also accepts a tuple of tensors; they are sliced together
# along the first axis, so every element is a (features, target) pair.
int_features = tf.random.uniform([256, 5], minval=1, maxval=10, dtype=tf.int32)  # 256 rows, 5 columns
normal_targets = tf.random.normal([256])  # one float per row
dataset = tf.data.Dataset.from_tensor_slices((int_features, normal_targets))
print(dataset.element_spec)


(TensorSpec(shape=(5,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.float32, name=None))


In [9]:
# Rebuild the paired dataset: 256 rows of 5 random integers plus one
# normally distributed float per row.
int_features = tf.random.uniform([256, 5], minval=1, maxval=10, dtype=tf.int32)
normal_targets = tf.random.normal([256])
dataset = tf.data.Dataset.from_tensor_slices((int_features, normal_targets))

# take(2) yields only the first two elements of the dataset.
first_two = dataset.take(2)
for elem in first_two:
  print(elem)

(<tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 8, 6, 5, 5], dtype=int32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-1.418302>)
(<tf.Tensor: shape=(5,), dtype=int32, numpy=array([8, 7, 3, 3, 7], dtype=int32)>, <tf.Tensor: shape=(), dtype=float32, numpy=-0.425806>)


In [10]:
tf.random.uniform([256, 5], minval = 1, maxval = 10, dtype=tf.int32)

<tf.Tensor: shape=(256, 5), dtype=int32, numpy=
array([[7, 7, 1, 4, 6],
       [4, 4, 9, 8, 1],
       [7, 6, 4, 1, 1],
       ...,
       [3, 1, 3, 7, 8],
       [8, 9, 4, 5, 6],
       [1, 7, 3, 6, 1]], dtype=int32)>

# Example

Let's see tf.data on a real-world dataset

In [11]:
# tf.data with cifar10 dataset

from tensorflow.keras.datasets import cifar10

(X_train, y_train), (X_test, y_test) = cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [12]:
print(X_train.shape)
print(y_train.shape)

(50000, 32, 32, 3)
(50000, 1)


In [13]:
# As the shapes above show, the first dimension is always the length (size) of the dataset.

dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
print(dataset.element_spec)

(TensorSpec(shape=(32, 32, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(1,), dtype=tf.uint8, name=None))


we can iterate over this dataset by using for loop to see the data inside

In [28]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# ImageDataGenerator applies random augmentation (here: horizontal shifts of
# up to 20% of the width and random horizontal flips) while it yields batches.
img_datagen = ImageDataGenerator(width_shift_range=0.2, horizontal_flip=True)

# Wrap the augmenting generator in a tf.data.Dataset.
# `img_datagen.flow` is the callable that returns the generator and `args`
# holds the arguments it is called with (the images and their labels).
# flow() runs with its default batch size of 32 here, but the 50000 training
# samples are not a multiple of 32, so the last batch of every pass is
# smaller. The batch axis is therefore declared as None instead of a fixed
# 32; with a hard-coded 32, from_generator rejects the short batch.
dataset = tf.data.Dataset.from_generator(
    img_datagen.flow,
    args=[X_train, y_train],
    output_types=(tf.float32, tf.int32),  # (image batch, label batch)
    output_shapes=(tf.TensorShape([None, 32, 32, 3]),  # (batch, height, width, channels)
                   tf.TensorShape([None, 1])),         # (batch, 1)
)


# **Tensorflow Dataset class Coding examples:**

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

In [32]:
# create a NumPy array to build the dataset from
x = np.zeros((100, 10, 2, 2))

# create a dataset from tensor x
dataset1 = tf.data.Dataset.from_tensor_slices(x)

print(dataset1)
print(dataset1.element_spec)

<TensorSliceDataset shapes: (10, 2, 2), types: tf.float64>
TensorSpec(shape=(10, 2, 2), dtype=tf.float64, name=None)


In [33]:
x2 = [np.zeros((10, 2, 2)), np.zeros((5, 2, 2))]
# The two arrays have different shapes, so the list cannot be converted into
# a single tensor and from_tensor_slices raises a ValueError. The error is
# the point of this cell, so it is caught and displayed instead of aborting
# a "Run All" of the notebook.
try:
  dataset2 = tf.data.Dataset.from_tensor_slices(x2)
  print(dataset2)
  print(dataset2.element_spec)
except ValueError as err:
  print(f"ValueError: {err}")

ValueError: ignored

In [34]:
x2 = np.zeros((10,2, 2)) 
x2.shape

(10, 2, 2)

In [35]:
# A plain Python list that stores two arrays with different leading dimensions.
first_array = np.zeros((10, 2, 2))
second_array = np.zeros((5, 2, 2))
x2 = [first_array, second_array]
x2[0].shape  # shape of the first array only; both arrays live in the same list

(10, 2, 2)

In [36]:
# Three arrays that all share the shape (10, 1): with matching shapes the
# list can be turned into a dataset.
x2 = [np.zeros((10, 1)) for _ in range(3)]

In [37]:
dataset2 = tf.data.Dataset.from_tensor_slices(x2)
print(dataset2.element_spec)

TensorSpec(shape=(10, 1), dtype=tf.float64, name=None)


## **Combining two datasets into larger dataset, done with the help of Zipping**

In [38]:
# Combine 2 datasets into a larger one
# Note: the two datasets do not need to have the same element shape (here (10, 2, 2) and (10, 1)); each zipped element is a tuple with one entry per dataset.

dataset_zipped = tf.data.Dataset.zip((dataset1, dataset2)) # we will pass two datasets that we want to combine as tuple.

print(dataset_zipped.element_spec)

(TensorSpec(shape=(10, 2, 2), dtype=tf.float64, name=None), TensorSpec(shape=(10, 1), dtype=tf.float64, name=None))


In [40]:
# define a function that counts the elements of a (zipped) dataset

def get_batches(dataset):
  """Count the elements produced by one full pass over ``dataset``.

  For a batched dataset this is the number of batches; for an unbatched
  dataset it is the number of individual elements.

  Args:
    dataset: Any iterable, e.g. a ``tf.data.Dataset``.

  Returns:
    The number of elements yielded, 0 for an empty iterable.

  Raises:
    TypeError: If ``dataset`` is not iterable.
  """
  # Count by iterating only. The elements themselves are never evaluated for
  # truthiness, so falsy elements (0, empty values) and array-like elements
  # whose truth value is ambiguous are counted like any other element.
  # Errors raised while iterating propagate instead of being swallowed.
  return sum(1 for _ in dataset)


In [41]:
# find the no of batches in the zipped dataset

get_batches(dataset_zipped)
# 3 is the number of elements of the smaller dataset in the zip, i.e. dataset2.
# Note: if the zipped datasets have different numbers of elements, the longer one is trimmed to the length of the shortest one.

3

In [42]:
# creating dataset from real data.

# load the mnist dataset

(train_features, train_labels), (test_features, test_labels) = tf.keras.datasets.mnist.load_data()

print(type(train_features), type(train_labels))
print(train_features.shape, train_labels.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(60000, 28, 28) (60000,)


In [43]:
# create a dataset from the mnist data

mnist_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))


In [44]:
# Inspect the Dataset object

print(mnist_dataset.element_spec)
# Each element of the resulting dataset is a tuple: the first entry is one training feature (an image), the second entry is its training label.

(TensorSpec(shape=(28, 28), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.uint8, name=None))


In [45]:
# Inspect the length of an element using the take method

element = next(iter(mnist_dataset.take(1)))
print(len(element))
# The length is 2 because each element is a (features, label) tuple.


2


In [46]:
# inspecting the shapes of the two entries of the element
print(element[0].shape)
print(element[1].shape)

(28, 28)
()


# ***Create a dataset from the text data***

In [13]:
# print the list of text files.

text_files = sorted([f.path for f in os.scandir('/content/drive/My Drive/datasets/shakespeare')])

text_files

['/content/drive/My Drive/datasets/shakespeare/tempest.1.1.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.1.2.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.2.1.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.2.2.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.3.1.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.3.2.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.3.3.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.4.1.txt',
 '/content/drive/My Drive/datasets/shakespeare/tempest.5.1.txt']

In [14]:
# Load the first file with plain Python and print its first 5 lines.

with open(text_files[0], 'r') as fil:
  # readline() keeps the trailing newline and print() adds another one,
  # which is why the output shows a blank line after every line of text.
  for _ in range(5):
    print(fil.readline())

SCENE I. On a ship at sea: a tempestuous noise

of thunder and lightning heard.

Enter a Master and a Boatswain



Master



In [11]:
# Load the lines from the files into a dataset using TextLineDataset

shakespare_dataset = tf.data.TextLineDataset(text_files)
# you can pass here one file or multiple files. It's upto u. 

In [15]:
# Use the take method to get and print the first 5 lines of the dataset

# take(5) builds a dataset limited to the first 5 elements; because the files
# are read in order, these are the first 5 lines of the first file.
first_5_lines_dataset = iter(shakespare_dataset.take(5))
lines = list(first_5_lines_dataset)
for line in lines:
    print(line)

# Only lines of the first file show up here. That does not mean the other
# files were skipped; the cells below check how the files are laid out.

tf.Tensor(b'SCENE I. On a ship at sea: a tempestuous noise', shape=(), dtype=string)
tf.Tensor(b'of thunder and lightning heard.', shape=(), dtype=string)
tf.Tensor(b'Enter a Master and a Boatswain', shape=(), dtype=string)
tf.Tensor(b'', shape=(), dtype=string)
tf.Tensor(b'Master', shape=(), dtype=string)


In [19]:
# compute the number of lines in the first file

with open(text_files[0], 'r') as fil:
  lines = fil.readlines()  # one list entry per line, trailing newline included
print(len(lines))

# The files are loaded sequentially, so the first len(lines) elements of the
# dataset all come from the first file; the lines of the second file only
# start after them.
# In effect the lines of all files are concatenated into one long sequence.

121


In [23]:
# Compute the number of lines in the whole Shakespeare dataset, i.e. across all files.
# take() is not used here, so the iterator runs through every file.

shakespeare_dataset_iterator = iter(shakespare_dataset)
lines = list(shakespeare_dataset_iterator)
print(len(lines))


3134


In [46]:
# Fetch the first element of the dataset. Despite the name, `files` holds a
# single line of text (a string tensor), not file names.
files = next(iter(shakespare_dataset))
print(files)

allfile = []  # NOTE(review): not used in the visible cells; confirm before removing
# Keep one iterator around so that repeated next(a) calls walk through the lines.
a = iter(shakespare_dataset)

tf.Tensor(b'SCENE I. On a ship at sea: a tempestuous noise', shape=(), dtype=string)


<tf.Tensor: shape=(), dtype=string, numpy=b'SCENE I. On a ship at sea: a tempestuous noise'>

In [59]:
ex_line = next(a)   # each call returns the next line from the iterator `a`; StopIteration is raised once it is exhausted
print(ex_line.numpy())

b"Good, speak to the mariners: fall to't, yarely,"


#### Interleave lines from the text data files

In [60]:
# Problem: the lines of all files were merged into one sequence. To work on
# the data of every file separately we can use the interleave function.

# Create a dataset of the text file strings

text_files_dataset = tf.data.Dataset.from_tensor_slices(text_files)
files = list(text_files_dataset)
for file in files:
    print(file)

# This is the list of files we have in our text_files list.

tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.1.1.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.1.2.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.2.1.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.2.2.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.3.1.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.3.2.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.3.3.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.4.1.txt', shape=(), dtype=string)
tf.Tensor(b'/content/drive/My Drive/datasets/shakespeare/tempest.5.1.txt', shape=(), dtype=string)


In [61]:
# interleave takes a mapping function that turns every element of
# text_files_dataset (a file path) into a dataset of its own, here a
# TextLineDataset over that file, and then draws elements from those
# per-file datasets in turn.
# cycle_length is the number of input elements (files) that are interleaved
# at the same time. It is derived from the file list instead of being
# hard-coded to 9, so every file takes part in the rotation even if the
# number of files changes.
# Interleave the lines from the text files

interleaved_shakespare_dataset = text_files_dataset.interleave(
    tf.data.TextLineDataset, cycle_length=len(text_files))
print(interleaved_shakespare_dataset)


<InterleaveDataset shapes: (), types: tf.string>


In [66]:
# Print the first 10 elements of the interleaved dataset

lines = list(interleaved_shakespare_dataset.take(10))
for line in lines:
    print(line)

# These 10 lines come from 9 different files: the first 9 are the first line
# of each file, and the 10th is the second line of the first file.
# Printing more lines would continue with the second line of the other files.

tf.Tensor(b'SCENE I. On a ship at sea: a tempestuous noise', shape=(), dtype=string)
tf.Tensor(b"SCENE II. The island. Before PROSPERO'S cell.", shape=(), dtype=string)
tf.Tensor(b'SCENE I. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b'SCENE II. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b"SCENE I. Before PROSPERO'S Cell.", shape=(), dtype=string)
tf.Tensor(b'SCENE II. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b'SCENE III. Another part of the island.', shape=(), dtype=string)
tf.Tensor(b"SCENE I. Before PROSPERO'S cell.", shape=(), dtype=string)
tf.Tensor(b"SCENE I. Before PROSPERO'S cell.", shape=(), dtype=string)
tf.Tensor(b'of thunder and lightning heard.', shape=(), dtype=string)
