<a href="https://colab.research.google.com/github/gorogoro-uk/TensorFlow/blob/master/TensorFlow_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TensorFlow Summary**

---



**Data Pre-processing**

In [None]:
# 1. tensorflow dataset
# eg MNIST
# use datasets directly in model.fit()

# 1.1 get data directly; load_data() returns the train/test split,
#     each as a (data, label) pair
import tensorflow as tf
mnist_data = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist_data.load_data()


# 1.2 reshape train/test tensors (adds a trailing channel axis: 28x28 -> 28x28x1)
# standardise pixel values to range 0-1
import tensorflow as tf  # repeated so this snippet can be copied on its own
mnist_data = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist_data.load_data()

# -1 lets numpy infer the number of samples instead of hard-coding 60000 / 10000
train_images = train_images.reshape(-1, 28, 28, 1)
test_images = test_images.reshape(-1, 28, 28, 1)
train_images = train_images / 255.0
test_images = test_images / 255.0

In [None]:
# 2. image zip file sourced from internet with Image Data Generator
# eg happy/sad images

import os
from pathlib import Path
import urllib.request
import zipfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# prepare directories
BASE = Path.cwd() / 'happysad'          # working directory for this example
HS_DATA = BASE / 'hs_data'              # unzipped images are extracted here
ZIP_DEST = BASE / 'happy-or-sad.zip'    # downloaded zip file destination
ZIP_URL = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/happy-or-sad.zip"

# parents=True also creates BASE; exist_ok=True makes re-running the cell safe
# (no separate exists() check, so no check-then-create race)
HS_DATA.mkdir(parents=True, exist_ok=True)

# download data file & unzip
urllib.request.urlretrieve(ZIP_URL, ZIP_DEST)
# the context manager closes the archive even if extraction raises
with zipfile.ZipFile(ZIP_DEST, 'r') as zip_ref:
    zip_ref.extractall(HS_DATA)

# image data generator + flow_from_directory
# yields batches of rescaled images to feed to the model
# labels are created automatically from the directory structure
# the resulting iterator is what gets passed to model.fit()
image_data_gen = ImageDataGenerator(rescale=1 / 255.0)

train_data_gen = image_data_gen.flow_from_directory(
    directory=HS_DATA,
    class_mode='binary',
    batch_size=10,
    target_size=(150, 150),
)

In [None]:
# 3. image data zip file
# manually create train/test split and move to directory
# feed images to the model with Image Data Generator (with augmentation)
# eg cats/dogs

from pathlib import Path
import os
import urllib.request
import zipfile
import random
from shutil import copyfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator


# prepare data directories
BASE = Path.cwd() / 'catdog'          # base directory
ZIP_DEST = BASE / 'cats_dogs.zip'     # zip file destination
CAT_SOURCE = BASE / 'PetImages/Cat'   # cat images read by split_data
DOG_SOURCE = BASE / 'PetImages/Dog'   # dog images read by split_data
TRAIN_DEST = BASE / 'train'           # training images
TEST_DEST = BASE / 'test'             # testing images
TRAIN_CAT = TRAIN_DEST / 'cat'        # cat training images
TEST_CAT = TEST_DEST / 'cat'          # cat testing images
TRAIN_DOG = TRAIN_DEST / 'dog'        # dog training images
TEST_DOG = TEST_DEST / 'dog'          # dog testing images

# create directories
# creating the four leaf directories with parents=True also creates BASE,
# train/ and test/; exist_ok=True makes re-running the cell safe
for data_dir in (TRAIN_CAT, TRAIN_DOG, TEST_CAT, TEST_DOG):
    data_dir.mkdir(parents=True, exist_ok=True)

# download & unzip data
URL = "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip"
urllib.request.urlretrieve(URL, ZIP_DEST)
# the context manager closes the archive even if extraction raises
with zipfile.ZipFile(ZIP_DEST, 'r') as zip_ref:
    zip_ref.extractall(BASE)

# split data into train & test
TRAIN_SIZE = 0.90   # fraction of the images used for training


def split_data(source, train, test, split):
    """Shuffle the images in `source` and copy them into `train` / `test`.

    Zero-length files are reported and skipped; entries that are not regular
    files (e.g. sub-directories) are ignored. Files are copied, not moved,
    so `source` is left untouched.

    Args:
        source: directory containing the image files.
        train: existing directory that receives the training images.
        test: existing directory that receives the testing images.
        split: fraction (0-1) of the usable images copied to `train`;
            the remainder is copied to `test`.

    Raises:
        ValueError: if `split` is outside the range 0-1.
    """
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    # accept plain string paths as well as Path objects
    source, train, test = Path(source), Path(train), Path(test)

    # list of usable image file names
    files = []
    for filename in os.listdir(source):
        path = source / filename
        if not path.is_file():
            # copyfile cannot copy directories, so leave them out
            continue
        if os.path.getsize(path) > 0:
            files.append(filename)
        else:
            print(filename + " is zero length, so ignoring.")

    train_length = int(len(files) * split)

    # shuffle dataset images
    shuffled_set = random.sample(files, len(files))

    # define train, test split
    # slice from train_length rather than [-test_length:]: a negative-index
    # slice of -0 returns the whole list when there are no test images
    train_set = shuffled_set[:train_length]
    test_set = shuffled_set[train_length:]

    # copy files to train or test directory
    for filename in train_set:
        copyfile(source / filename, train / filename)

    for filename in test_set:
        copyfile(source / filename, test / filename)

# populate train/ and test/ for each class using the same split fraction
split_data(source=CAT_SOURCE, train=TRAIN_CAT, test=TEST_CAT, split=TRAIN_SIZE)
split_data(source=DOG_SOURCE, train=TRAIN_DOG, test=TEST_DOG, split=TRAIN_SIZE)

# image data generator + flow_from_directory
# augmentation applied on the fly: rescale, rotate, shift, shear, zoom, flip
# flow_from_directory sets batch size and image size, and creates binary
# labels based on the directory each image is in
# the resulting iterator is passed to model.fit()
train_image_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = train_image_datagen.flow_from_directory(
    directory=TRAIN_DEST,
    target_size=(150, 150),
    batch_size=100,
    class_mode='binary',
)

# test images are only rescaled (same scaling as the training images)
# random augmentation is a training-time technique; applying it here would
# evaluate the model on distorted images and make the metrics noisy
test_image_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = test_image_datagen.flow_from_directory(TEST_DEST,
                                                    batch_size=100,
                                                    class_mode='binary',
                                                    target_size=(150, 150))