# Understanding Images Using Tensorflow 2.0 (beta)

https://www.tensorflow.org/beta/tutorials/load_data/images  
https://www.tensorflow.org/tutorials/load_data/tf_records

You will need these skills!  What is an image, tf.Example, serialized example etc.  
You won't get far with served models without this understanding.

## TensorFlow 2.0 Beta

### Go through UnderstandingTF_IO FIRST

### Do this Second, then UnderstandingExample.ipynb
This code  wants TF 2.0.   But, you can do the Example code with 1.14 (and eager execution)

### Then, go through UnderstandingImages

In [None]:
import os, pathlib
import random
import IPython.display as display

import matplotlib.pyplot as plt

# -- sorry -- this is confusing
#   DON'T load 2.0 if you are just getting data
#   only load 2.0 if you are working through THIS notebook completely
#   because - most of the software is not 2.0 compatible

# !pip install -q tensorflow==2.0.0-beta1
# you'll get a tensorflow-serving-api error

import tensorflow as tf

# cfa code utilities
from code.cfa_utils.tar_util import extract_tarball_directory

In [None]:
# Show the installed TensorFlow version as the cell output.
# You really need TensorFlow 2.0.x for this notebook.
tf.__version__

In [None]:
# Shorthand for tf.data's AUTOTUNE constant; it is passed as num_parallel_calls
# to Dataset.map() further down in this notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# if you are using tf 1.14, you need to turn on eager execution - but you should be on 2.0
# adding it here mainly for reference, you can do most of the tf.Example tutorial in 1.14
#
# On TF 2.x eager execution is already on and the top-level
# tf.enable_eager_execution() is gone, so calling it unconditionally fails there.
# Checking tf.executing_eagerly() first and going through tf.compat.v1 keeps
# this cell runnable on both 1.14 and 2.x.
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

## Globals

you can get some test images from S3 - look for cfa_products / test_images  
There is also a test.tfrecord (that was created as a slice of the train/val split)

In [None]:
# Every local path below is resolved against the directory the notebook runs in.
PROJECT_DIR = os.getcwd()

BUCKET = 'cfa-eadatasciencesb-sagemaker'

# S3 locations of the cfa_products dataset
# (only the all_images URI carries a trailing slash)
S3_ALL_IMAGES = "s3://" + BUCKET + "/datasets/cfa_products/all_images/"
S3_ALL_ANNOTATIONS = "s3://" + BUCKET + "/datasets/cfa_products/all_annotations"
S3_TEST_IMAGES = "s3://" + BUCKET + "/datasets/cfa_products/test_images"

# scratch space: downloaded tarballs and the directory they are unpacked into
TARBALL_DIR = os.path.join(PROJECT_DIR, "data/tarballs")
TARBALL_EXTRACT = os.path.join(PROJECT_DIR, "data/tarball_extract")

# final resting place of the images and their annotation files
IMAGE_DIR = os.path.join(PROJECT_DIR, "data/jpeg_images")
ANNOTATION_DIR = os.path.join(PROJECT_DIR, "data/annotations")

# exported model and its label map
MODEL_PATH = os.path.join(PROJECT_DIR, "trained_model/export/Servo/1564778509")
LABEL_MAP = os.path.join(PROJECT_DIR, "code/cfa_prod_label_map.pbtxt")

# you can get data using the TrainModel_Step1_Local notebook
TEST_TFRECORDS_PATH = os.path.join(PROJECT_DIR, "code/tfrecords/test/")

# a single image to experiment with
SAMPLE_IMAGE = os.path.join(PROJECT_DIR, "data/new_jpeg_images/20190710_variety_1562781002.jpg")


## Data - choice
0 - the data is already in IMAGE_DIR - go directly to summary  
1 - load IMAGE_DIR w/ Large Set of Training Data  
2 - load IMAGE_DIR w/ Test Set  

Select one choice, then go to the summary.

In [None]:
# Choice 1
# Execute THIS block for TRAINING DATA

! mkdir -p /home/ec2-user/SageMaker/ssd-dag/data/tarball_extract
# TRAINING DATA - ALL IMAGES
# - delete tarballs first
# - delete images first - you can comment this out if you don't like it
! rm {TARBALL_DIR}/*.tar.gz -rf
! rm {IMAGE_DIR}/*.jpg -rf
! rm {ANNOTATION_DIR}/*.xml -rf

# get from 
! aws s3 cp {S3_ALL_IMAGES} {TARBALL_DIR} --recursive --quiet

jpg_ext = '.jpg'
extract_tarball_directory(TARBALL_DIR, TARBALL_EXTRACT, jpg_ext, IMAGE_DIR)

# TRAINING DATA - ALL ANNOTATIONS
# - delete tarballs first
# - delete annotations first - you can comment this out if you don't like it
! rm {TARBALL_DIR}/*.tar.gz -rf
! rm {ANNOTATIONS_DIR}/*.xml -rf

# get from 
! aws s3 cp {S3_ALL_ANNOTATIONS} {TARBALL_DIR} --recursive --quiet

xml_ext = '.xml'
extract_tarball_directory(TARBALL_DIR, TARBALL_EXTRACT, xml_ext, ANNOTATION_DIR)

# clean up
! rm {TARBALL_DIR}/*.tar.gz -rf
! ls {IMAGE_DIR} | wc
! ls {ANNOTATION_DIR} | wc

In [None]:
# choice 2
# Excecute THIS block for Test
! rm {IMAGE_DIR}/*.jpg -rf
! rm {ANNOTATIONS_DIR}/*.xml -rf

# test images are not tarballed
# get from S3
! aws s3 cp {S3_TEST_IMAGES} {IMAGE_DIR} --recursive --quiet

### After choosing a data set - RESUME HERE

In [None]:
# Summary
# By now IMAGE_DIR holds the images: they were already present,
# or you loaded the training set, or you loaded the test set.

# Traditional - the plain Python way to list a directory.
# Globbing on *.jpg leaves out the .gitkeep file.
data_root = pathlib.Path(IMAGE_DIR)
trad_image_paths = [str(path) for path in data_root.glob('*.jpg')]

# DON'T
# image_paths = tf.io.gfile.listdir(IMAGE_DIR)
# you'll get all files - including the .gitkeep file

# DO - same glob, but through tf.io
image_glob = os.path.join(IMAGE_DIR, '*.jpg')
tf_image_paths = tf.io.gfile.glob(image_glob)

# compare both listings: count plus the first three entries
print("traditional:", len(trad_image_paths), '\n', trad_image_paths[:3])
print("      tf.io:", len(tf_image_paths), '\n', tf_image_paths[:3])

# conclusion - not a lot of difference,
#   but I would say master the tf.io functions - they will make things easier
#   for you in the long run - tailored for this use case & less code

# common variable used by all the cells that follow
image_paths = tf_image_paths


In [None]:
# Shuffle the path list in place, so image_paths[0] (used in later cells) is an arbitrary image.
random.shuffle(image_paths)

In [None]:
# Show three randomly chosen images straight from their file paths,
# using IPython.display (imported at the top as `display`).
# random.choice may pick the same path more than once.

for n in range(3):
  image_path = random.choice(image_paths)
  display.display(display.Image(image_path))


## TensorFlow Utilities
The tutorial says:  
##### TensorFlow includes all the tools you need to load and process images:

In [None]:
# Display an image
#   using IPython.display, directly from its path.
# img_path (first entry of the shuffled list) is reused by the cells below.
img_path = image_paths[0]
print ("img_path:", img_path)
display.display(display.Image(img_path))

In [None]:
# Read the image file into a tensor.
# - note
#     - with Eager on, the type is an EagerTensor
#     - without Eager, the type is Tensor
#
#   With an EagerTensor the value is directly available through .numpy().
#   At this stage that value is still the raw, encoded file content (bytes),
#   NOT pixels - it has to be decoded before it can be used as an image.

img_tensor = tf.io.read_file(img_path)
print ("read_file:", type(img_tensor))
# Printing an EagerTensor prints its whole value - because it's Eager, the data is
# already here, not lazy - which for an image file is a very long byte string.
# - so don't do it
# print ("   ", img_tensor)

# print only the first 100 characters of the repr instead
print ("")
print(repr(img_tensor)[:100]+"...")

# tf.io.is_jpeg works with an EagerTensor
print (tf.io.is_jpeg(img_tensor))

# pull the value out of the tensor; the type is printed to show what .numpy()
# hands back for a string tensor, followed by the first 10 bytes of the file
img_numpy_bytes = img_tensor.numpy()
print (type(img_numpy_bytes))
print (img_numpy_bytes[:10])

In [None]:
# Decode the tensor
# - note that with an EagerTensor the value can be printed
# - with Eager Execution, you don't have to run a session to get the value

print ("img_tensor - from read_file is type:", type(img_tensor), '\n', img_tensor.dtype)

# decode the string / byte array with decode_image()
# - the result is still a Tensor (not a numpy array), now holding pixel values
#   instead of the encoded file content
img_tensor_numpy = tf.image.decode_image(img_tensor)
# label corrected: it used to read "deocde to a numpy Tensor:"
print ("decode to a Tensor:", type(img_tensor_numpy),  '\n', img_tensor_numpy.dtype)

# .numpy() converts the decoded tensor into an actual numpy array
img_numpy_uint8 = img_tensor_numpy.numpy()
print("img_numpy_uint8 type:", type(img_numpy_uint8))
print("    numpy shape:", img_numpy_uint8.shape)
print("    numpy type:", img_numpy_uint8.dtype)

In [None]:
# operations on the tensor
# -- resize - the input is the DECODED image tensor from the previous cell
#    (its dtype is printed below), not the raw file bytes
print ("input(img_tensor_numpy):", type(img_tensor_numpy))
print ("      of dtype:", img_tensor_numpy.dtype)

# after resizing, you have a Tensor, decoded, float32, NOT normalized images
img_resized = tf.image.resize(img_tensor_numpy, [300, 300])
print ("tf.image.resize:", type(img_resized))
print ("    of dtype:", img_resized.dtype)
# peek at a small corner of the image (slices past the end of an axis are simply cut short)
print ("      values:", img_resized.numpy()[:5,:5,:5], '\n')

# you can normalize the image
#   with a scalar-looking operation: the division is applied to every element
img_resized_normal = img_resized/255.0
print("tensor type:", type(img_resized_normal))
print("tensor shape:", img_resized_normal.shape)
# min / max confirm the value range after dividing by 255
print("tensor range in values:", img_resized_normal.numpy().min(), img_resized_normal.numpy().max())
print("tensor data type:", img_resized_normal.dtype)

## Define Functions

In [None]:
# input:   Tensor, image :  string/byte array (encoded JPEG)
# output:  Tensor, image :  float32 normalized
def preprocess_image(image):
    """Turn encoded JPEG bytes into a normalized 192x192 RGB image tensor.

    Steps: decode as a 3-channel JPEG, resize to 192x192, divide by 255.

    Args:
        image: string Tensor holding the encoded JPEG content.

    Returns:
        The resized image Tensor with every value divided by 255.0.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    resized = tf.image.resize(decoded, [192, 192])
    # normalize to [0,1] range
    return resized / 255.0

In [None]:
# input:  image path
# output: whatever preprocess_image() returns
#         Tensor, image: float32 normalized
def load_and_preprocess_image(path):
    """Read the file at `path` and run its content through preprocess_image().

    Args:
        path: path of the image file (a string or a string Tensor).

    Returns:
        The result of preprocess_image() for the file content.
    """
    return preprocess_image(tf.io.read_file(path))

In [None]:
# test our functions
# - load + preprocess the first image of the list and plot the result

image_path = image_paths[0]

# use the path picked right above (this used to read `img_path`, a variable that
# only exists if an earlier cell was run, leaving `image_path` unused)
plt.imshow(load_and_preprocess_image(image_path))
plt.grid(False)
print()

In [None]:
# get length of dataset
# - at this time, there is no simple function
#   this is brute force - I just want this for QA reasons

def get_dataset_length(ds):
    """Count the elements of `ds` by iterating over it once.

    Args:
        ds: any iterable (here: a tf.data.Dataset).

    Returns:
        Number of elements the iteration produced (0 for an empty iterable).
    """
    return sum(1 for _ in ds)

In [None]:
# sample Map Function
# input:  tf.Tensor
def map_fn(tensor):
    """Map a string Tensor to a boolean: does it fully match the '...142.jpg' pattern?

    Other tf.strings functions that were tried in this spot:
        tf.strings.length(tensor)
        tf.strings.substr(tensor,0,8)

    Args:
        tensor: string Tensor (an image path).

    Returns:
        Result of tf.strings.regex_full_match for the pattern below.
    """
    return tf.strings.regex_full_match(tensor, r'.*/jpeg_images/.*142\.jpg')

In [None]:
# sample Filter Function
# takes an input
# must return boolean (True/False)
# - True, the record is kept
# - False, the record is dropped
def filter_fn(tensor):
    """Predicate for Dataset.filter(): keep ONLY the paths of the 2SB group.

    Patterns that were tried here:
      - keep ONLY /jpeg_images/*142.jpg:   r'.*/jpeg_images/.*142\\.jpg'
      - filter OUT the *2SB* group with a negative lookahead:
            r'^.*/jpeg_images/(?!.*2SB).*\\.jpg'
        This one raised an error that was never resolved.
        NOTE(review): tf.strings.regex_full_match is documented to use RE2
        syntax, which has no lookahead (?! ) - that would explain the error; confirm.
      - inverting the result with `if match.numpy(): return False ...` was also
        sketched but is not active.

    Args:
        tensor: string Tensor (an image path).

    Returns:
        Boolean Tensor, True when the path fully matches the 2SB pattern.
    """
    # keep ONLY 2SB
    keep_only_2sb = r'^.*/jpeg_images/.*2SB.*\.jpg'
    return tf.strings.regex_full_match(tensor, keep_only_2sb)

In [None]:
# print out the contents of a Dataset
def print_dataset_contents(ds):
    """Print the .numpy() value of every element of `ds`, one per line.

    Args:
        ds: iterable whose elements offer a .numpy() method
            (here: a tf.data.Dataset yielding EagerTensors).
    """
    for element in ds:
        value = element.numpy()
        print(value)

## DataSets

https://www.tensorflow.org/api_docs/python/tf/data/Dataset

magic happens with a Dataset.  
You can create a dataset from a list of file paths.

You can apply a function to each record with the .map() operation.


In [None]:
# Take the list of strings -- image paths --
# and make a dataset with one element per path
image_path_ds = tf.data.Dataset.from_tensor_slices(image_paths)
print ("image path dataset type:", type(image_path_ds))

# note - printing the dataset shows its element type/shape (strings, no size), not its contents
print ("image path dataset:", image_path_ds)

In [None]:
# WARNING - some deprecation risk: these helpers live under tf.compat.v1
# Report the element class and element shape of the path dataset.
print ("Dataset output class:", tf.compat.v1.data.get_output_classes(image_path_ds))
print ("Dataset output class shape:", tf.compat.v1.data.get_output_shapes(image_path_ds))

In [None]:
# cache the dataset to memory
# Dataset.cache() does not change the dataset in place - it returns a NEW, cached
# dataset. The result therefore has to be assigned back, otherwise it is thrown
# away and nothing is cached.
image_path_ds = image_path_ds.cache()
# show the (now cached) dataset as the cell output
image_path_ds

In [None]:
# take n from a dataset
# - take(5) builds a new dataset from the first (up to) 5 elements
# - the lengths are counted by brute-force iteration (get_dataset_length)
print ("Full Dataset:", get_dataset_length(image_path_ds))
image_path_subset_ds = image_path_ds.take(5)
print ("Subset Dataset:", get_dataset_length(image_path_subset_ds))

### Iterate on a Dataset
Each element is an EagerTensor - assuming you have Eager Execution enabled

This Dataset is still just strings (Tensors containing strings).  So any operations must support type = Tensor.   See:  
https://www.tensorflow.org/api_docs/python/tf/strings

In [None]:
# you can decode every Tensor:
# .numpy() gives the element's value as bytes, .decode() turns that into a Python str
for t in image_path_subset_ds:
    image_path = t.numpy().decode()
    print (image_path)

### Map on a DataSet
Your function must take a Tensor as input and return a Tensor.  This dataset holds strings, so look at the tf.strings functions.  You can't just use a plain Python string function.

In [None]:
# use a map function == map_fn
#  must operate on & return a Tensor
# prints one value per path of the subset: the result of map_fn for that path
print_dataset_contents(image_path_subset_ds.map(map_fn))


### Filter a DataSet
this dataset is a string - so, use a regex to do basic filter operations.   

You can't pass parameters into your function - so all logic has to be inside the filter_fn.

Below:  make sure you understand the RegEx, then incorporate it into the filter_function

In [None]:
# filter using a RegEx
# Try the pattern out in plain Python before moving it into map_fn / filter_fn.
import re

pattern = r'.*/jpeg_images/.*142\.jpg'
# pattern = r'^.*/jpeg_images/(?!.*2SB).*\.jpg'
sample1_text = '/home/ec2-user/SageMaker/ssd-dag/data/jpeg_images/20190531_2SB_1559319142.jpg'
sample2_text = '/home/ec2-user/SageMaker/ssd-dag/data/jpeg_images/20190603_3MC_1559598324.jpg'

sample_list = [sample1_text, sample2_text]

for txt in sample_list:
    # re.fullmatch (not re.match): the tf.strings.regex_full_match calls used in
    # map_fn / filter_fn require the WHOLE string to match, whereas re.match only
    # anchors at the start and would accept trailing text after ".jpg".
    matchObject = re.fullmatch(pattern, txt)
    if matchObject:
        print ("Found:", matchObject.group())
    else:
        print ("NOT Found")

In [None]:
# Experiment with a negative lookahead in Python's `re` (imported in the cell above):
# the pattern only matches when the text does NOT match '.*/jpeg_images/.*412'
# from the start, and is then followed by anything ending in '.jpg'.
result = re.match(r"(?!.*/jpeg_images/.*412).*\.jpg", "/home/jpeg_images/4324721234.jpg")

# prints "True" on a match; prints nothing at all when there is no match
if result:
    print ("True")

In [None]:
# Keep only the elements of the 5-path subset for which filter_fn returns True,
# then print what is left (possibly nothing).
ds_filtered = image_path_subset_ds.filter(filter_fn)
print_dataset_contents(ds_filtered)

In [None]:
# Using dataset map()
# you can map the dataset - super cool!
# - use the functions we defined earlier
# - num_parallel_calls=AUTOTUNE lets tf.data decide how many elements to process in parallel

image_ds = image_path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
# report on the NEW dataset (this used to print image_path_ds, i.e. the input)
print ("image dataset type:", type(image_ds))
# each element is now the output of load_and_preprocess_image
# (a resized, normalized image tensor) instead of a path string
print ("image dataset:", image_ds)

In [None]:
# you can pull the value out by iterating on the Dataset
# - plot the first 4 preprocessed images in one 2x2 grid

# create the figure ONCE; creating it inside the loop (and calling plt.show()
# per iteration) produced four separate figures with a single small subplot each
plt.figure(figsize=(8,8))
for n, image in enumerate(image_ds.take(4)):
    plt.subplot(2,2,n+1)   # subplot positions are 1-based
    plt.imshow(image)
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
plt.show()