# Chapter 14: Deep Computer Vision Using Convolutional Neural Networks

Note: CNNs are not restricted to visual perception; they are also useful for other tasks such as voice recognition and natural language processing (NLP).

In this chapter:
 - CNN theory and building blocks
 - implementation with TF and Keras
 - tasks like object detection and semantic segmentation

In [1]:
from sklearn.datasets import load_sample_images
import tensorflow as tf

# Load the sample images, center-crop them to 70x120, then multiply the
# pixel values by 1/255 -- written as one composed expression so `images`
# is bound exactly once.
images = tf.keras.layers.Rescaling(scale=1 / 255)(
    tf.keras.layers.CenterCrop(height=70, width=120)(
        load_sample_images()["images"]
    )
)

In [3]:
# 2 images, each 70x120 (because of the CenterCrop above, not the rescaling),
# 3 color channels (RGB)
images.shape

TensorShape([2, 70, 120, 3])

In [4]:
# 7x7 convolution producing 32 feature maps
conv_layer = tf.keras.layers.Conv2D(
    filters=32,
    kernel_size=7,
    padding="valid",  # the Keras default, spelled out: no zero padding
)
# Run the sample images through the convolutional layer
fmaps = conv_layer(images)

In [7]:
# Same two images after Conv2D, without zero padding

# New dims: 2 images, 64x114 after the 7x7 convolution (70 - 7 + 1 = 64 and
# 120 - 7 + 1 = 114), 32 feature maps (one per filter)
fmaps.shape

TensorShape([2, 64, 114, 32])

In [9]:
# padding="valid" adds no zeros: the kernel only visits positions where it
# fits entirely inside the input.

# Rebuild the same layer, this time with zero padding ("same")
conv_layer = tf.keras.layers.Conv2D(
    filters=32,
    kernel_size=7,
    padding="same",
)

# Apply it to the images and inspect the resulting feature-map shape
fmaps = conv_layer(images)
fmaps.shape

TensorShape([2, 70, 120, 32])

In [10]:
# If stride > 1, output size != input size

# Convolutional layers hold weights and biases as usual
kernels, biases = conv_layer.get_weights()

# A notebook cell only displays its last expression, so a bare `kernels.shape`
# here would be silently discarded. Check the kernel shape with an assertion:
# [kernel_height, kernel_width, input_channels, output_channels]
assert kernels.shape == (7, 7, 3, 32)

# Number of output channels == number of output feature maps
biases.shape  # [output_channels]

(32,)

In [11]:
# Max pooling layer with a pool size of 2
max_pool = tf.keras.layers.MaxPool2D(pool_size=2)

In [12]:
class DepthPool(tf.keras.layers.Layer):
    """Max pooling along the channel (last) axis instead of the spatial axes.

    The last axis of the input is split into consecutive groups of
    ``pool_size`` channels and only the maximum of each group is kept, so the
    output has ``channels // pool_size`` channels; all other axes are unchanged.

    Args:
        pool_size: Number of consecutive channels in each group. The number
            of input channels must be a multiple of it, otherwise the
            reshape in ``call`` fails.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, pool_size=2, **kwargs):
        super().__init__(**kwargs)
        self.pool_size = pool_size

    def call(self, inputs):
        """Return the per-group maximum over the channel axis of ``inputs``."""
        shape = tf.shape(inputs)  # shape[-1] is the number of channels
        groups = shape[-1] // self.pool_size  # number of channel groups
        # Split the channel axis into [groups, pool_size], keep the leading axes
        new_shape = tf.concat([shape[:-1], [groups, self.pool_size]], axis=0)
        # Reduce over the trailing pool_size axis: one max per channel group
        return tf.reduce_max(tf.reshape(inputs, new_shape), axis=-1)

    def get_config(self):
        """Return the layer config, including ``pool_size``.

        Without this override the custom constructor argument is not part of
        the saved config, so the layer could not be re-created from it.
        """
        config = super().get_config()
        config["pool_size"] = self.pool_size
        return config

In [13]:
# Global average pooling layer (applied to the sample images in the next cell)
global_avg_pool = tf.keras.layers.GlobalAvgPool2D()

In [14]:
# Input of shape (2, 70, 120, 3) is reduced to (2, 3): one mean value per
# color channel for each of the two images
global_avg_pool(images)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.64338624, 0.5971759 , 0.5824972 ],
       [0.76306933, 0.26011038, 0.10849128]], dtype=float32)>

In [15]:
from functools import partial

# Conv2D pre-configured with this notebook's defaults: 3x3 kernel, "same"
# padding, ReLU activation and He-normal initialization. Individual calls can
# still override any of these (the first layer uses a 7x7 kernel).
DefaultConv2D = partial(tf.keras.layers.Conv2D, kernel_size=3, padding="same",
                        activation="relu", kernel_initializer="he_normal")
model = tf.keras.Sequential([
    # Declare the input explicitly instead of passing `input_shape=` to the
    # first layer, which is deprecated in current Keras versions.
    tf.keras.Input(shape=(28, 28, 1)),
    DefaultConv2D(filters=64, kernel_size=7),
    tf.keras.layers.MaxPool2D(),
    # The number of filters doubles after each pooling stage: 64 -> 128 -> 256
    DefaultConv2D(filters=128),
    DefaultConv2D(filters=128),
    tf.keras.layers.MaxPool2D(),
    DefaultConv2D(filters=256),
    DefaultConv2D(filters=256),
    tf.keras.layers.MaxPool2D(),
    # Dense classifier head on the flattened feature maps, with dropout after
    # each hidden layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=64, activation="relu",
                          kernel_initializer="he_normal"),
    tf.keras.layers.Dropout(0.5),
    # 10-way softmax output
    tf.keras.layers.Dense(units=10, activation="softmax")
])