<a href="https://colab.research.google.com/github/gorogoro-uk/TensorFlow/blob/master/TensorFlow_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TensorFlow Summary**

---



**Data Pre-processing**

In [None]:
# 1. tensorflow dataset
# eg MNIST
# use datasets directly in model.fit()

# 1.1 get data directly and use tf methods to split into test/train, data/label subsets
import tensorflow as tf
mnist_data = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist_data.load_data()


# 1.2 reshape train/test tensors
# standardise pixel values to range 0-1
import tensorflow as tf
mnist_data = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist_data.load_data()

# add a trailing channel axis: (N, 28, 28) -> (N, 28, 28, 1)
# -1 lets the sample count be inferred instead of hard-coding 60000 / 10000
train_images = train_images.reshape(-1, 28, 28, 1)
test_images = test_images.reshape(-1, 28, 28, 1)
# pixel values divided by 255.0 so they fall in the range 0-1
train_images = train_images / 255.0
test_images = test_images / 255.0

In [None]:
# 2. image zip file sourced from internet with Image Data Generator
# eg happy/sad images

import os
from pathlib import Path
import urllib.request
import zipfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# prepare directories
BASE = Path(os.getcwd()) / 'happysad'
HS_DATA = BASE / 'hs_data'
ZIP_DEST = BASE / 'happy-or-sad.zip'
ZIP_URL = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/happy-or-sad.zip"
# parents=True creates BASE as well; exist_ok=True makes the cell safe to re-run
HS_DATA.mkdir(parents=True, exist_ok=True)

# download data file & unzip
urllib.request.urlretrieve(ZIP_URL, ZIP_DEST)
# context manager closes the archive even if extraction raises
with zipfile.ZipFile(ZIP_DEST, 'r') as zip_ref:
    zip_ref.extractall(HS_DATA)

# image data generator, flow from directory
# creates batches of images to feed to model
# label data is created automatically based on directory structure
# image data generator is passed to model.fit()
image_data_gen = ImageDataGenerator(rescale=1/255.0)

train_data_gen = image_data_gen.flow_from_directory(
    HS_DATA,
    target_size=(150, 150),
    batch_size=10,
    class_mode='binary')

In [None]:
# 3. image zip file sourced from internet with Image Data Generator
# manually create train/test split and move to directory
# eg cats/dogs

from pathlib import Path
import os
import urllib.request
import zipfile
import random
from shutil import copyfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# prepare data directories
BASE = Path(os.getcwd()) / 'catdog'   # base directory
ZIP_DEST = BASE / 'cats_dogs.zip'     # zip file destination
CAT_SOURCE = BASE / 'PetImages/Cat'
DOG_SOURCE = BASE / 'PetImages/Dog'
TRAIN_DEST = BASE / 'train'           # training images
TEST_DEST = BASE / 'test'             # testing images
TRAIN_CAT = BASE / 'train/cat'        # cat training images
TEST_CAT = BASE / 'test/cat'          # cat testing images
TRAIN_DOG = BASE / 'train/dog'        # dog training images
TEST_DOG = BASE / 'test/dog'          # dog testing images

# create the four leaf directories; parents=True also creates BASE, train/ and test/,
# and exist_ok=True makes the cell safe to re-run
for leaf_dir in (TRAIN_CAT, TRAIN_DOG, TEST_CAT, TEST_DOG):
    leaf_dir.mkdir(parents=True, exist_ok=True)

# download & unzip data
URL = "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip"
urllib.request.urlretrieve(URL, ZIP_DEST)
# context manager closes the archive even if extraction raises
with zipfile.ZipFile(ZIP_DEST, 'r') as zip_ref:
    zip_ref.extractall(BASE)

# split data into train & test
# fraction of the usable images copied to the train directories
TRAIN_SIZE = 0.90

def split_data(source, train, test, split):
    """Shuffle the non-empty files in ``source`` and copy them into train/test.

    Args:
        source: directory holding the files to split (str or Path).
        train: directory that receives the first ``split`` share of the files.
        test: directory that receives the remaining files.
        split: fraction of the usable files copied to ``train``.

    Zero-length files are reported and skipped. Files are copied, not moved,
    so ``source`` is left unchanged. The destination directories are not
    created here.
    """
    source, train, test = Path(source), Path(train), Path(test)

    # keep only file names whose size is non-zero
    files = []
    for filename in os.listdir(source):
        if os.path.getsize(source / filename) > 0:
            files.append(filename)
        else:
            print(filename + " is zero length, so ignoring.")

    # shuffle dataset images
    shuffled_set = random.sample(files, len(files))

    # cut both halves at the same boundary; a negative-index slice such as
    # shuffled_set[-0:] would return every file when the test share is empty
    train_length = int(len(files) * split)
    train_set = shuffled_set[:train_length]
    test_set = shuffled_set[train_length:]

    # copy files to train or test directory
    for filename in train_set:
        copyfile(source / filename, train / filename)

    for filename in test_set:
        copyfile(source / filename, test / filename)

# split each class with the shared train ratio
split_data(source=CAT_SOURCE, train=TRAIN_CAT, test=TEST_CAT, split=TRAIN_SIZE)
split_data(source=DOG_SOURCE, train=TRAIN_DOG, test=TEST_DOG, split=TRAIN_SIZE)

# image data generator, flow from directory
# train images: rescale plus augmentation (rotate, shift, shear, zoom, flip)
# test images: rescale only, no augmentation
# batch size, image size and binary labels (from directory names) are shared by both flows
# the resulting generators are passed to model.fit()
flow_options = dict(batch_size=100,
                    class_mode='binary',
                    target_size=(150, 150))

train_image_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = train_image_datagen.flow_from_directory(TRAIN_DEST, **flow_options)

test_image_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = test_image_datagen.flow_from_directory(TEST_DEST, **flow_options)

In [None]:
# 4. image zip file sourced from internet with Image Data Generator
# separate train & test datasets
# eg. horse/human images

from pathlib import Path
import urllib.request
import os
import zipfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# prepare data directories
CWD = Path(os.getcwd())
BASE = CWD / 'horsehuman'
TRAIN = BASE / 'train'
TEST = BASE / 'test'
TRAIN_HORSE = TRAIN / 'horses'
TRAIN_HUMAN = TRAIN / 'humans'
TEST_HORSE = TEST / 'horses'
TEST_HUMAN = TEST / 'humans'
# parents=True also creates BASE; exist_ok=True makes the cell safe to re-run
TRAIN.mkdir(parents=True, exist_ok=True)
TEST.mkdir(parents=True, exist_ok=True)

# get train & test datasets
TRAIN_URL = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/horse-or-human.zip"
TEST_URL = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/validation-horse-or-human.zip"
ZIP_TRAIN = TRAIN / 'horse-or-human.zip'
ZIP_TEST = TEST / 'validation-horse-or-human.zip'

# ZIP_TRAIN / ZIP_TEST already include their directory, so they are used as-is
urllib.request.urlretrieve(TRAIN_URL, ZIP_TRAIN)
urllib.request.urlretrieve(TEST_URL, ZIP_TEST)

# context managers close each archive even if extraction raises
with zipfile.ZipFile(ZIP_TRAIN, 'r') as zip_ref:
    zip_ref.extractall(TRAIN)
with zipfile.ZipFile(ZIP_TEST, 'r') as zip_ref:
    zip_ref.extractall(TEST)

# image data generator, flow from directory
# data augmentation: rescale, rotate, shift, shear, zoom, flip
# define batch, image size, create binary labels based on directory
# image data generator is passed to model.fit()
# augmentation not required on test images
train_image_datagen = ImageDataGenerator(rescale=1./255,
                                          rotation_range=40,
                                          width_shift_range=0.2,
                                          height_shift_range=0.2,
                                          shear_range=0.2,
                                          zoom_range=0.2,
                                          horizontal_flip=True)

train_datagen = train_image_datagen.flow_from_directory(TRAIN,
                                                    batch_size=20,
                                                    class_mode='binary',
                                                    target_size=(150, 150))

test_image_datagen = ImageDataGenerator(rescale=1./255)

test_datagen = test_image_datagen.flow_from_directory(TEST,
                                                    batch_size=20,
                                                    class_mode='binary',
                                                    target_size=(150, 150))

In [None]:
# 5. csv file sourced from internet with Image Data Generator
# data & label extracted from csv file
# separate train & test files
# eg. sign MNIST images

import os
from pathlib import Path
import csv
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# locate the train/test csv files relative to the current working directory
CWD = Path.cwd()
TRAIN_FILE = CWD.joinpath('signlanguage/sign_mnist_train.csv')
TEST_FILE = CWD.joinpath('signlanguage/sign_mnist_test.csv')

# read data from csv file, line by line
# image pixel values flattened in one row, reshape for use
def prepare_data(file_name):
    """Parse a csv file into ``(images, labels)`` float arrays.

    The first row of the file is skipped. In every other row, column 0 is
    taken as the label and columns 1-784 as pixel values, which are
    reshaped into a 28x28 grid.

    Returns:
        images: float array with one 28x28 grid per data row.
        labels: float array with one value per data row.
    """
    label_temp = []
    image_temp = []
    with open(file_name) as data_file:
        rows = csv.reader(data_file, delimiter=',')
        next(rows, None)  # discard the first row
        for record in rows:
            label_temp.append(record[0])
            # values are still strings here; they are converted to float below
            image_temp.append(np.reshape(record[1:785], (28, 28)))
    labels = np.array(label_temp).astype('float')
    images = np.array(image_temp).astype('float')
    return images, labels

# load both splits from csv
x_train, y_train = prepare_data(TRAIN_FILE)
x_test, y_test = prepare_data(TEST_FILE)

# append a channel axis to each image: [28, 28] -> [28, 28, 1]
x_train = np.expand_dims(x_train, axis=3)
x_test = np.expand_dims(x_test, axis=3)

# image data generator, flow (from in-memory arrays)
# train: rescale plus augmentation (rotate, shift, shear, zoom, flip)
# test: rescale only, augmentation not required
# define batch size & data/label datasets
# the resulting generators are passed to model.fit()
augmentation = {
    'rotation_range': 40,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
train_image_datagen = ImageDataGenerator(rescale=1/255., **augmentation)
test_image_datagen = ImageDataGenerator(rescale=1/255.)

train_datagen = train_image_datagen.flow(x_train, y_train, batch_size=32)
test_datagen = test_image_datagen.flow(x_test, y_test, batch_size=32)

In [None]:
# 6. csv file sourced from internet
# data & label extracted from csv file
# eg. BBC news texts

from pathlib import Path
import os
import csv
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# get dataset
# imported here because this cell's import list does not include urllib.request
import urllib.request

BBC = Path(os.getcwd()) / 'bbc'
# exist_ok=True makes the cell safe to re-run
BBC.mkdir(parents=True, exist_ok=True)
BBC_FILE = BBC / 'bbc-text.csv'
BBC_URL = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv'
urllib.request.urlretrieve(BBC_URL, BBC_FILE)

# read csv file: extract data & labels datasets
# remove stopwords
# stopwords python list from external source
sentences = []
labels = []
# build the lookup once; set membership is O(1) per word
stopword_set = set(stopwords)
# newline='' is what the csv module recommends for files it parses
with open(BBC_FILE, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first row (presumably the header)
    for row in reader:
        # column 0 is the label, column 1 the text
        labels.append(row[0])
        # filter word by word: replacing " word " substrings misses adjacent
        # stopwords ("a a") and stopwords at the start or end of the text
        kept_words = [word for word in row[1].split() if word not in stopword_set]
        sentences.append(" ".join(kept_words))

# split into train/test at the train_ratio boundary (order is preserved)
train_size = int(len(sentences) * train_ratio)
train_sentences, test_sentences = sentences[:train_size], sentences[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

# tokenizer is fitted on the train sentences only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# convert train & test text to sequences with the same tokenizer
train_seq = tokenizer.texts_to_sequences(train_sentences)
test_seq = tokenizer.texts_to_sequences(test_sentences)

# pad both sets with identical settings
pad_options = dict(padding=padding_type, maxlen=max_length)
train_pad = pad_sequences(train_seq, **pad_options)
test_pad = pad_sequences(test_seq, **pad_options)

# tokenize labels: a separate tokenizer fitted on all labels
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# label sequences as Numpy arrays
train_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

In [None]:
# 7. csv file sourced from internet
# data & label extracted from csv file
# eg. social media texts

import os
from pathlib import Path
import urllib.request
import csv
import random
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# get dataset
CWD = Path(os.getcwd())
BASE = CWD / 'glove'
# exist_ok=True makes the cell safe to re-run
BASE.mkdir(parents=True, exist_ok=True)
DATA_FILE = BASE / 'training_cleaned.csv'
DATA_URL = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv'
urllib.request.urlretrieve(DATA_URL, DATA_FILE)

# read csv file: extract data & label
# each corpus item is [text, label]: text is column 5, label is 0 when
# column 0 is the string '0' and 1 otherwise
corpus = []
# newline='' is what the csv module recommends for files it parses
with open(DATA_FILE, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        corpus.append([row[5], 0 if row[0] == '0' else 1])
# number of rows read
num_sent = len(corpus)

# shuffle in place, then take the first train_size items as data/label lists
random.shuffle(corpus)
sentences = [corpus[i][0] for i in range(train_size)]
labels = [corpus[i][1] for i in range(train_size)]

# tokenize data
# fit tokenizer, keep its word_index, convert to sequences, pad/truncate to max_length
train_tokenizer = Tokenizer()
train_tokenizer.fit_on_texts(sentences)
word_index = train_tokenizer.word_index
train_seqs = train_tokenizer.texts_to_sequences(sentences)
train_pad_seqs = pad_sequences(
    train_seqs,
    maxlen=max_length,
    padding=pad_type,
    truncating=trunc_type,
)

# split into train/test, convert to numpy arrays
# the first `split` items form the test set, the rest (up to train_size) the train set
split = int(test_ratio * train_size)
test_data = np.array(train_pad_seqs[:split])
test_labels = np.array(labels[:split])
train_data = np.array(train_pad_seqs[split:train_size])
train_labels = np.array(labels[split:train_size])

In [None]:
# 8. text file sourced from internet
# data extracted from txt file
# create n-gram sequences & labels
# eg. Shakespeare's sonnets

import os
import urllib.request
from pathlib import Path
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku
import numpy as np

# get data
BASE = Path(os.getcwd()) / 'shakespeare'
SONNET_FILE = 'shakespeare/sonnets.txt'
SONNET_URL = 'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt'
SHAKESPEARE_MODEL_FILE = 'shakespeare/shakespeare_model'
# exist_ok=True makes the cell safe to re-run
BASE.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(SONNET_URL, SONNET_FILE)


# tokenize data
# create word_index, sequences, pad, convert to numpy array
# read through a context manager so the file handle is closed
with open(SONNET_FILE) as sonnet_file:
    data = sonnet_file.read()
corpus = data.lower().split("\n")       # list of strings, one per line
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# +1 because Keras word indices start at 1; index 0 is reserved and is the
# value pad_sequences fills with by default
total_words = len(tokenizer.word_index) + 1

# create n-gram sequences: for each line, every prefix of 2 or more tokens
input_seqs = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_seq = token_list[:i+1]
        input_seqs.append(n_gram_seq)

# left-pad every sequence to the length of the longest one
max_seq_len = max(len(seq) for seq in input_seqs)
input_seqs = np.array(pad_sequences(input_seqs, maxlen=max_seq_len, padding='pre'))

# create predictors and label datasets
# label is last word in each n-gram sequence, predictors are all words before it
predictors, label = input_seqs[:,:-1],input_seqs[:,-1]
label = ku.to_categorical(label, num_classes=total_words)    # one hot encoding

In [None]:
# 9. time series data made with functions
# built from components: base, trend, seasonality, noise

import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras

# plot time series
def plot_series(time, series, format="-", start=0, end=None):
    """Plot ``series`` against ``time`` over ``[start:end]`` with labelled axes and a grid.

    ``format`` is passed to ``plt.plot`` as the line format string.
    """
    window = slice(start, end)
    plt.plot(time[window], series[window], format)
    plt.xlabel("Time")
    plt.ylabel("Value")
    plt.grid(True)

# linear trend
def trend(time, slope=0):
    """Return a linear trend: ``slope`` multiplied by ``time``."""
    scaled = slope * time
    return scaled

# recurring seasonal pattern
def seasonal_pattern(season_time):
    """Return the pattern value for each position within one period.

    Positions below 0.1 follow ``cos(season_time * 7 * pi)``; all other
    positions follow ``1 / exp(5 * season_time)``.
    """
    early_part = np.cos(season_time * 7 * np.pi)
    late_part = 1 / np.exp(5 * season_time)
    return np.where(season_time < 0.1, early_part, late_part)

# seasonality
def seasonality(time, period, amplitude=1, phase=0):
    """Repeat ``seasonal_pattern`` every ``period`` steps, scaled by ``amplitude``.

    ``phase`` shifts the time axis before it is wrapped into the period.
    """
    shifted = time + phase
    # position within the current period, as a fraction in [0, 1)
    season_time = (shifted % period) / period
    pattern = seasonal_pattern(season_time)
    return amplitude * pattern

# random noise simulator
def noise(time, noise_level=1, seed=None):
    """Return ``len(time)`` standard-normal samples scaled by ``noise_level``.

    A fresh ``RandomState(seed)`` is used on every call, so a fixed seed
    gives repeatable output.
    """
    generator = np.random.RandomState(seed)
    samples = generator.randn(len(time))
    return samples * noise_level

# timebase vector: 4 * 365 + 1 steps, float32
time = np.arange(4 * 365 + 1, dtype="float32")

# series parameters
baseline, amplitude, slope, noise_level = 10, 40, 0.01, 2

# assemble the series: baseline + trend + seasonality, then add noise in place
trend_part = trend(time, slope)
seasonal_part = seasonality(time, period=365, amplitude=amplitude)
series = baseline + trend_part + seasonal_part
series += noise(time, noise_level, seed=42)

# train/validation split at a fixed time step
split_time = 1100
time_train, time_valid = time[:split_time], time[split_time:]
x_train, x_valid = series[:split_time], series[split_time:]

In [None]:
# 10. Windowed Dataset for Time Series Data
# manufacture data & label datasets from time series
# sliding window of data to predict next value

# define windowed dataset
# create dataset with right dimensions
# define window size, window shift, stride, drop partial windows
# convert to arrays, create data & label portions
# shuffle, batch, prefetch
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Turn a series into a batched dataset of (input, target) windows.

    Each window holds ``window_size + 1`` consecutive values (shift of 1,
    partial windows dropped). The input is the window without its last
    value and the target is the window without its first value. Windows
    are shuffled with ``shuffle_buffer``, batched by ``batch_size`` and
    prefetched one batch ahead.
    """
    span = window_size + 1
    # add a trailing axis so every time step is a length-1 vector
    expanded = tf.expand_dims(series, axis=-1)
    return (
        tf.data.Dataset.from_tensor_slices(expanded)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(lambda window: window.batch(span))
        .shuffle(shuffle_buffer)
        .map(lambda window: (window[:-1], window[1:]))
        .batch(batch_size)
        .prefetch(1)
    )

# build the windowed training dataset from the training series
train_set = windowed_dataset(series=x_train,
                             window_size=window_size,
                             batch_size=batch_size,
                             shuffle_buffer=shuffle_buffer_size)

# the windowed dataset is passed straight to model.fit()
sunspot_model_hist = sunspot_model.fit(train_set, epochs=100)