In [1]:
%matplotlib inline

In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skimage.transform import resize

import tensorflow as tf

from tensorflow.keras.datasets import cifar10

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, Conv2DTranspose, MaxPool2D, GlobalAvgPool2D, Rescaling, Add, UpSampling2D

from tensorflow.keras.applications import vgg19, resnet50, inception_resnet_v2

from tensorflow.keras.utils import to_categorical

# Neural Networks for Images

## __Convolutional neural networks__

Convolution - image _I_ and filter _F_, R = I ⊛ 𝐹

For each pixel (𝑖, 𝑗), $ 𝑅_{𝑖𝑗} = ∑ 𝐼_{𝑖𝑗} ∗ 𝐹 $

The result is different depending on the filter. 𝐹 is usually square, with odd rank (so that it has a central pixel) e.g 3, 5, 7, 9, 11.

stride - step for applying the filter

_Padding_

Valid convolution - no padding.

Same convolution - with padding to obgtain an image with same size of the input image. Thus, we will not reduce the image size.
So if the input is 7x7, we add padding to 9x9, and thus, after applying the filter we have again 7x7.

p = f - 1 / 2, so if f is 3 (3x3), the padding should be 1, in order to keep the output size unchanged.

_Output image dimentions_ (activation size)


$$ \left\lfloor \frac{n + 2p - f}{s} + 1 \right\rfloor \times \left\lfloor \frac{n + 2p - f}{s} + 1 \right\rfloor $$

If the image has many channels with dimensions 𝑛 × 𝑛 × 𝑐, just use an 𝑓 × 𝑓 × 𝑐 filter

▪ Apply each filter to each channel, sum all numbers

▪ Result: 2D image

_What happens if we have many filters?_ We apply each filter separately and produce a 2D image. Afterwards, we stack them (they are independent) and thus obtain a 3D volume (convolutional volume).

__Convolution layer__

![convolution layer](convolution_layer.png)

Convolution layers are invariant to translation (when the object is located on different places in the image).

__Demo: cifar10__

In [12]:
# cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


((array([[[[ 59,  62,  63],
           [ 43,  46,  45],
           [ 50,  48,  43],
           ...,
           [158, 132, 108],
           [152, 125, 102],
           [148, 124, 103]],
  
          [[ 16,  20,  20],
           [  0,   0,   0],
           [ 18,   8,   0],
           ...,
           [123,  88,  55],
           [119,  83,  50],
           [122,  87,  57]],
  
          [[ 25,  24,  21],
           [ 16,   7,   0],
           [ 49,  27,   8],
           ...,
           [118,  84,  50],
           [120,  84,  50],
           [109,  73,  42]],
  
          ...,
  
          [[208, 170,  96],
           [201, 153,  34],
           [198, 161,  26],
           ...,
           [160, 133,  70],
           [ 56,  31,   7],
           [ 53,  34,  20]],
  
          [[180, 139,  96],
           [173, 123,  42],
           [186, 144,  30],
           ...,
           [184, 148,  94],
           [ 97,  62,  34],
           [ 83,  53,  34]],
  
          [[177, 144, 116],
           [16

In [13]:
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()

In [14]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((50000, 32, 32, 3), (50000, 1), (10000, 32, 32, 3), (10000, 1))

In [16]:
X_train[0]

array([[[ 59,  62,  63],
        [ 43,  46,  45],
        [ 50,  48,  43],
        ...,
        [158, 132, 108],
        [152, 125, 102],
        [148, 124, 103]],

       [[ 16,  20,  20],
        [  0,   0,   0],
        [ 18,   8,   0],
        ...,
        [123,  88,  55],
        [119,  83,  50],
        [122,  87,  57]],

       [[ 25,  24,  21],
        [ 16,   7,   0],
        [ 49,  27,   8],
        ...,
        [118,  84,  50],
        [120,  84,  50],
        [109,  73,  42]],

       ...,

       [[208, 170,  96],
        [201, 153,  34],
        [198, 161,  26],
        ...,
        [160, 133,  70],
        [ 56,  31,   7],
        [ 53,  34,  20]],

       [[180, 139,  96],
        [173, 123,  42],
        [186, 144,  30],
        ...,
        [184, 148,  94],
        [ 97,  62,  34],
        [ 83,  53,  34]],

       [[177, 144, 116],
        [168, 129,  94],
        [179, 142,  87],
        ...,
        [216, 184, 140],
        [151, 118,  84],
        [123,  92,  72]]

In [17]:
Y_train[0]

array([6], dtype=uint8)

In [23]:
cnn_model = Sequential([
    Input((32, 32, 3)),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
])

In [24]:
cnn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 32, 32, 17)        476       
                                                                 
 conv2d_3 (Conv2D)           (None, 32, 32, 12)        1848      
                                                                 
 conv2d_4 (Conv2D)           (None, 32, 32, 10)        1090      
                                                                 
 conv2d_5 (Conv2D)           (None, 32, 32, 5)         455       
                                                                 
Total params: 3869 (15.11 KB)
Trainable params: 3869 (15.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


The result of the layer is activation or feature map.

In [29]:
dense_model = Sequential([
    Input((32, 32, 5)),
    Flatten(),
    Dense(20, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(10, activation = "softmax")
])

In [30]:
dense_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 5120)              0         
                                                                 
 dense_3 (Dense)             (None, 20)                102420    
                                                                 
 dense_4 (Dense)             (None, 10)                210       
                                                                 
Total params: 102630 (400.90 KB)
Trainable params: 102630 (400.90 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


A combination of two models is one model.

In [35]:
combined_cnn_model = Sequential([
    Input((32, 32, 3)),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    
    Flatten(),
    Dense(40, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(10, activation = "softmax")  
])

In [36]:
combined_cnn_model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_14 (Conv2D)          (None, 32, 32, 17)        476       
                                                                 
 conv2d_15 (Conv2D)          (None, 32, 32, 17)        2618      
                                                                 
 conv2d_16 (Conv2D)          (None, 32, 32, 12)        1848      
                                                                 
 conv2d_17 (Conv2D)          (None, 32, 32, 12)        1308      
                                                                 
 conv2d_18 (Conv2D)          (None, 32, 32, 10)        1090      
                                                                 
 conv2d_19 (Conv2D)          (None, 32, 32, 10)        910       
                                                                 
 conv2d_20 (Conv2D)          (None, 32, 32, 5)        

Pooling - for dimensionality reduction. Max pooling is freauently used. Image 8x8 -> 4x4.

In [39]:
combined_cnn_model = Sequential([
    Input((32, 32, 3)),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    
    Flatten(),
    Dense(40, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(10, activation = "softmax")  
])

In [40]:
combined_cnn_model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_30 (Conv2D)          (None, 32, 32, 17)        476       
                                                                 
 conv2d_31 (Conv2D)          (None, 32, 32, 17)        2618      
                                                                 
 conv2d_32 (Conv2D)          (None, 32, 32, 12)        1848      
                                                                 
 conv2d_33 (Conv2D)          (None, 32, 32, 12)        1308      
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 16, 16, 12)        0         
 g2D)                                                            
                                                                 
 conv2d_34 (Conv2D)          (None, 16, 16, 10)        1090      
                                                      

Thus, the parameters of the first dense layer were reduced from 200k to 50k.

In [41]:
combined_cnn_model = Sequential([
    Input((32, 32, 3)),
    Conv2D(filters = 19, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 19, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    
    Flatten(),
    Dense(40, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(10, activation = "softmax")  
])

In [42]:
combined_cnn_model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_38 (Conv2D)          (None, 32, 32, 19)        532       
                                                                 
 conv2d_39 (Conv2D)          (None, 32, 32, 19)        3268      
                                                                 
 conv2d_40 (Conv2D)          (None, 32, 32, 17)        2924      
                                                                 
 conv2d_41 (Conv2D)          (None, 32, 32, 17)        2618      
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 16, 16, 17)        0         
 g2D)                                                            
                                                                 
 conv2d_42 (Conv2D)          (None, 16, 16, 12)        1848      
                                                     

Global pooling - dimensionality reduction by channels

In [45]:
combined_cnn_model = Sequential([
    Input((32, 32, 3)),
    Conv2D(filters = 19, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 19, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 17, kernel_size = (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 12, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 10, kernel_size = (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    Conv2D(filters = 5, kernel_size = (3, 3), activation = "relu", padding = "same"),
    GlobalAvgPool2D(),
    
    # Flatten(),
    Dense(40, activation = "relu"),
    Dense(20, activation = "relu"),
    Dense(10, activation = "softmax")  
])

In [46]:
combined_cnn_model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_48 (Conv2D)          (None, 32, 32, 19)        532       
                                                                 
 conv2d_49 (Conv2D)          (None, 32, 32, 19)        3268      
                                                                 
 conv2d_50 (Conv2D)          (None, 32, 32, 17)        2924      
                                                                 
 conv2d_51 (Conv2D)          (None, 32, 32, 17)        2618      
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 16, 16, 17)        0         
 g2D)                                                            
                                                                 
 conv2d_52 (Conv2D)          (None, 16, 16, 12)        1848      
                                                     

If we have more elements, the pooling is more appropriate, otherwise we use Flatten.

In [47]:
combined_cnn_model.compile(loss = "sparse_categorical_crossentropy", optimizer = "adam")

In [48]:
combined_cnn_model.fit(X_train, Y_train)



<keras.src.callbacks.History at 0x2f326bcdf10>

For bigger dimensionality reduction, dilated convolutions could be used. They cover greater area of the image and there are overlapped areas. 

__VGG-19__

In [51]:
vgg_model = vgg19.VGG19()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5


In [52]:
vgg_model.summary()

Model: "vgg19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [64]:
vgg_model.layers[12].weights

[<tf.Variable 'block4_conv1/kernel:0' shape=(3, 3, 256, 512) dtype=float32, numpy=
 array([[[[ 2.58524120e-02, -1.38346152e-03, -1.28712133e-02, ...,
            9.87619133e-05, -8.83383490e-03,  1.52001427e-02],
          [-6.59053866e-03, -3.86062381e-03,  3.73427582e-04, ...,
           -3.60257644e-03, -7.42938137e-03, -3.18204402e-05],
          [-1.53181388e-03, -5.74517436e-03,  7.80890870e-04, ...,
           -2.33465689e-03, -5.37212007e-03,  3.34833376e-03],
          ...,
          [ 1.99864022e-02, -4.72304597e-03, -5.95615292e-03, ...,
            2.24462058e-03, -1.52456667e-03,  3.91305285e-03],
          [-1.01398793e-03,  9.53671616e-03,  1.46581251e-02, ...,
            7.65887601e-03,  1.41440369e-02, -1.71044245e-04],
          [-1.04687642e-02, -8.56693648e-03,  3.51992506e-03, ...,
           -3.08121531e-03, -6.91359676e-03, -1.19565260e-02]],
 
         [[-3.10214609e-03, -1.74580247e-03,  4.19515837e-03, ...,
           -1.31797250e-02, -9.71959066e-03,  1.8878

In [55]:
vgg_model.layers[12].kernel.numpy()

array([[[[ 2.58524120e-02, -1.38346152e-03, -1.28712133e-02, ...,
           9.87619133e-05, -8.83383490e-03,  1.52001427e-02],
         [-6.59053866e-03, -3.86062381e-03,  3.73427582e-04, ...,
          -3.60257644e-03, -7.42938137e-03, -3.18204402e-05],
         [-1.53181388e-03, -5.74517436e-03,  7.80890870e-04, ...,
          -2.33465689e-03, -5.37212007e-03,  3.34833376e-03],
         ...,
         [ 1.99864022e-02, -4.72304597e-03, -5.95615292e-03, ...,
           2.24462058e-03, -1.52456667e-03,  3.91305285e-03],
         [-1.01398793e-03,  9.53671616e-03,  1.46581251e-02, ...,
           7.65887601e-03,  1.41440369e-02, -1.71044245e-04],
         [-1.04687642e-02, -8.56693648e-03,  3.51992506e-03, ...,
          -3.08121531e-03, -6.91359676e-03, -1.19565260e-02]],

        [[-3.10214609e-03, -1.74580247e-03,  4.19515837e-03, ...,
          -1.31797250e-02, -9.71959066e-03,  1.88785251e-02],
         [-7.30105676e-03, -4.72193910e-03,  3.04051372e-03, ...,
          -4.88789612e

In [67]:
vgg19.preprocess_input(X_train[0])

array([[[ -40.939003 ,  -54.779    ,  -64.68     ],
        [ -58.939003 ,  -70.779    ,  -80.68     ],
        [ -60.939003 ,  -68.779    ,  -73.68     ],
        ...,
        [   4.060997 ,   15.221001 ,   34.32     ],
        [  -1.939003 ,    8.221001 ,   28.32     ],
        [  -0.939003 ,    7.2210007,   24.32     ]],

       [[ -83.939    ,  -96.779    , -107.68     ],
        [-103.939    , -116.779    , -123.68     ],
        [-103.939    , -108.779    , -105.68     ],
        ...,
        [ -48.939003 ,  -28.779    ,   -0.6800003],
        [ -53.939003 ,  -33.779    ,   -4.6800003],
        [ -46.939003 ,  -29.779    ,   -1.6800003]],

       [[ -82.939    ,  -92.779    ,  -98.68     ],
        [-103.939    , -109.779    , -107.68     ],
        [ -95.939    ,  -89.779    ,  -74.68     ],
        ...,
        [ -53.939003 ,  -32.779    ,   -5.6800003],
        [ -53.939003 ,  -32.779    ,   -3.6800003],
        [ -61.939003 ,  -43.779    ,  -14.68     ]],

       ...,

      

In [68]:
preprocessed = vgg19.preprocess_input(X_train[:10])

In [69]:
preprocessed.shape

(10, 32, 32, 3)

In [75]:
resize(X_train[0], (224, 224), preserve_range = True)

array([[[ 33.71428571,  36.40816327,  36.48979592],
        [ 36.        ,  38.93877551,  39.18367347],
        [ 38.28571429,  41.46938776,  41.87755102],
        ...,
        [137.        , 107.97959184,  82.7755102 ],
        [137.14285714, 107.81632653,  82.26530612],
        [137.28571429, 107.65306122,  81.75510204]],

       [[ 39.85714286,  42.65306122,  42.75510204],
        [ 42.14285714,  45.10204082,  45.40816327],
        [ 44.42857143,  47.55102041,  48.06122449],
        ...,
        [140.85714286, 113.36734694,  89.46938776],
        [141.14285714, 113.30612245,  89.08163265],
        [141.42857143, 113.24489796,  88.69387755]],

       [[ 46.        ,  48.89795918,  49.02040816],
        [ 48.28571429,  51.26530612,  51.63265306],
        [ 50.57142857,  53.63265306,  54.24489796],
        ...,
        [144.71428571, 118.75510204,  96.16326531],
        [145.14285714, 118.79591837,  95.89795918],
        [145.57142857, 118.83673469,  95.63265306]],

       ...,

      

In [76]:
resize(X_train[0], (224, 224), preserve_range = True).astype(int).shape

(224, 224, 3)

In [79]:
resize(X_train[0], (224, 224), preserve_range = True).astype(np.uint8) # in colab the image is shown, but not here

array([[[ 33,  36,  36],
        [ 36,  38,  39],
        [ 38,  41,  41],
        ...,
        [137, 107,  82],
        [137, 107,  82],
        [137, 107,  81]],

       [[ 39,  42,  42],
        [ 42,  45,  45],
        [ 44,  47,  48],
        ...,
        [140, 113,  89],
        [141, 113,  89],
        [141, 113,  88]],

       [[ 46,  48,  49],
        [ 48,  51,  51],
        [ 50,  53,  54],
        ...,
        [144, 118,  96],
        [145, 118,  95],
        [145, 118,  95]],

       ...,

       [[173, 136, 101],
        [174, 138, 105],
        [176, 141, 109],
        ...,
        [121,  89,  68],
        [124,  93,  69],
        [128,  96,  70]],

       [[174, 136,  96],
        [175, 138, 101],
        [176, 140, 105],
        ...,
        [115,  83,  62],
        [118,  86,  63],
        [121,  89,  64]],

       [[174, 135,  92],
        [175, 137,  97],
        [177, 139, 102],
        ...,
        [109,  77,  56],
        [112,  80,  57],
        [115,  83,  58]]

In [81]:
images = np.array([resize(X_train[i], (224, 224), preserve_range = True) for i in range(50)])

In [82]:
images.shape

(50, 224, 224, 3)

In [83]:
preprocessed_images = vgg19.preprocess_input(images)

In [84]:
vgg_model.predict(preprocessed_images)



array([[6.60000253e-04, 1.54194823e-02, 1.05126810e-05, ...,
        4.37909359e-04, 5.19159134e-04, 6.72889200e-06],
       [8.17942123e-07, 7.23069491e-07, 2.74367338e-08, ...,
        2.63078075e-08, 1.37153654e-06, 1.80770996e-07],
       [1.10285684e-04, 3.02091867e-05, 5.56721170e-06, ...,
        5.39723396e-06, 1.61359305e-04, 4.75498155e-06],
       ...,
       [1.64104731e-05, 8.03369312e-06, 3.71299247e-05, ...,
        8.31878197e-07, 2.02112597e-06, 1.07212429e-07],
       [1.06359839e-04, 3.54378949e-06, 1.19230117e-06, ...,
        5.12081954e-07, 6.69041356e-06, 1.09936752e-06],
       [6.87307738e-06, 2.74593593e-04, 1.84506780e-06, ...,
        7.38498500e-07, 1.94472796e-05, 1.80933867e-07]], dtype=float32)

In [85]:
predictions = vgg_model.predict(preprocessed_images)



In [86]:
vgg19.decode_predictions(predictions)

[[('n03347037', 'fire_screen', 0.12565713),
  ('n04443257', 'tobacco_shop', 0.034944907),
  ('n02494079', 'squirrel_monkey', 0.021992981),
  ('n03871628', 'packet', 0.021060562),
  ('n01773549', 'barn_spider', 0.020223849)],
 [('n03796401', 'moving_van', 0.90137094),
  ('n04467665', 'trailer_truck', 0.04625424),
  ('n03776460', 'mobile_home', 0.030025419),
  ('n03417042', 'garbage_truck', 0.0032234343),
  ('n03895866', 'passenger_car', 0.0024076705)],
 [('n04428191', 'thresher', 0.2820687),
  ('n03134739', 'croquet_ball', 0.05490087),
  ('n03000684', 'chain_saw', 0.04629812),
  ('n03498962', 'hatchet', 0.027435793),
  ('n02950826', 'cannon', 0.02711591)],
 [('n01795545', 'black_grouse', 0.24535722),
  ('n02422106', 'hartebeest', 0.1578502),
  ('n02002724', 'black_stork', 0.084581904),
  ('n01871265', 'tusker', 0.043428022),
  ('n02114855', 'coyote', 0.033424266)],
 [('n03796401', 'moving_van', 0.7323819),
  ('n04467665', 'trailer_truck', 0.09475787),
  ('n02690373', 'airliner', 0.04141

In [89]:
tf.keras.backend.clear_session()

__ResNet__

In [90]:
resnet = resnet50.ResNet50()

In [91]:
resnet.summary()

Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['input_1[0][0]']             
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                       

In [96]:
inputs = Input((220, 220, 3))
conv1 = Conv2D(20, (3, 3), activation = "relu", padding = "same")(inputs)
conv2 = Conv2D(20, (3, 3), activation = "relu", padding = "same")(conv1)
conv3 = Conv2D(20, (3, 3), activation = "relu", padding = "same")(conv2)
conv4 = Conv2D(20, (3, 3), activation = "relu", padding = "same")(conv3)

add_result = Add()([conv1, conv4])

residual_block = Model(inputs = inputs, outputs = add_result)

In [97]:
residual_block.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 220, 220, 3)]        0         []                            
                                                                                                  
 conv2d_8 (Conv2D)           (None, 220, 220, 20)         560       ['input_4[0][0]']             
                                                                                                  
 conv2d_9 (Conv2D)           (None, 220, 220, 20)         3620      ['conv2d_8[0][0]']            
                                                                                                  
 conv2d_10 (Conv2D)          (None, 220, 220, 20)         3620      ['conv2d_9[0][0]']            
                                                                                            

In [101]:
[residual_block for _ in range(5)] # we can generate a number of blocks

[<keras.src.engine.functional.Functional at 0x2f33d0ad6d0>,
 <keras.src.engine.functional.Functional at 0x2f33d0ad6d0>,
 <keras.src.engine.functional.Functional at 0x2f33d0ad6d0>,
 <keras.src.engine.functional.Functional at 0x2f33d0ad6d0>,
 <keras.src.engine.functional.Functional at 0x2f33d0ad6d0>]

A convolution 1x1 is equvalent to a Dense layer. By this way, we can create a NN in a NN.

__Inception__ - apply different filters of different sizes, and than the network decides what's most useful. More features, more time needed, but less tuning.

In [104]:
# inception_resnet_v2.InceptionResNetV2().summary()

This is a very deep model.

The channels should be normalized before they enter the NN.

## __Encoder / Decoder Architecture__

Encoder -> CNN

Decoder -> output image with segmentation (of the same or e.g. biger size), text (description of an image)

There are skip-connections (residual connestions) between the complementary blocks of the encoder and decoder.

In [108]:
encoder = Sequential([
    Input((224, 224, 3)),
    Conv2D(256, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(128, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),      
    Conv2D(128, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(64, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(), 
    Conv2D(64, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(32, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D()
])

In [109]:
encoder.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_221 (Conv2D)         (None, 224, 224, 256)     7168      
                                                                 
 conv2d_222 (Conv2D)         (None, 224, 224, 128)     295040    
                                                                 
 max_pooling2d_6 (MaxPoolin  (None, 112, 112, 128)     0         
 g2D)                                                            
                                                                 
 conv2d_223 (Conv2D)         (None, 112, 112, 128)     147584    
                                                                 
 conv2d_224 (Conv2D)         (None, 112, 112, 64)      73792     
                                                                 
 max_pooling2d_7 (MaxPoolin  (None, 56, 56, 64)        0         
 g2D)                                                 

In [115]:
decoder = Sequential([
    Input((28, 28, 32)),
    UpSampling2D(),
    Conv2D(32, (3, 3), activation = "relu", padding = "same"),
    Conv2D(64, (3, 3), activation = "relu", padding = "same"),      
    UpSampling2D(), 
    Conv2D(64, (3, 3), activation = "relu", padding = "same"),
    Conv2D(128, (3, 3), activation = "relu", padding = "same"),      
    UpSampling2D(),      
    Conv2D(128, (3, 3), activation = "relu", padding = "same"),
    Conv2D(256, (3, 3), activation = "relu", padding = "same"), 

    Conv2D(3, (3, 3), padding = "same") 
])

In [116]:
decoder.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 up_sampling2d_3 (UpSamplin  (None, 56, 56, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_234 (Conv2D)         (None, 56, 56, 32)        9248      
                                                                 
 conv2d_235 (Conv2D)         (None, 56, 56, 64)        18496     
                                                                 
 up_sampling2d_4 (UpSamplin  (None, 112, 112, 64)      0         
 g2D)                                                            
                                                                 
 conv2d_236 (Conv2D)         (None, 112, 112, 64)      36928     
                                                                 
 conv2d_237 (Conv2D)         (None, 112, 112, 128)    

When we use UpSampling, we did not use Conv2, but Conv2DTranspose.

encoder => $ f(x) $

decoder => $ x = f^{-1}(f(x)) $ => $ f^{-1}(x) $

In [123]:
encoder_decoder = Sequential([
    Input((224, 224, 3)),
    Conv2D(256, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(128, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),      
    Conv2D(128, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(64, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(), 
    Conv2D(64, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(32, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),

    UpSampling2D(),
    Conv2DTranspose(32, (3, 3), activation = "relu", padding = "same"),
    Conv2DTranspose(64, (3, 3), activation = "relu", padding = "same"),      
    UpSampling2D(), 
    Conv2DTranspose(64, (3, 3), activation = "relu", padding = "same"),
    Conv2DTranspose(128, (3, 3), activation = "relu", padding = "same"),      
    UpSampling2D(),      
    Conv2DTranspose(128, (3, 3), activation = "relu", padding = "same"),
    Conv2DTranspose(256, (3, 3), activation = "relu", padding = "same"), 

    Conv2DTranspose(3, (3, 3), padding = "same") 
])

In [124]:
encoder_decoder.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_267 (Conv2D)         (None, 224, 224, 256)     7168      
                                                                 
 conv2d_268 (Conv2D)         (None, 224, 224, 128)     295040    
                                                                 
 max_pooling2d_15 (MaxPooli  (None, 112, 112, 128)     0         
 ng2D)                                                           
                                                                 
 conv2d_269 (Conv2D)         (None, 112, 112, 128)     147584    
                                                                 
 conv2d_270 (Conv2D)         (None, 112, 112, 64)      73792     
                                                                 
 max_pooling2d_16 (MaxPooli  (None, 56, 56, 64)        0         
 ng2D)                                                

In [163]:
# for reconstruction
class_encoder_decoder = Sequential([
    Input((32, 32, 3)),
    # Rescaling(1.0 / 255, input_shape = (32, 32, 3)),
    Conv2D(20, (3, 3), activation = "relu", padding = "same", kernel_initializer = "he_normal"),      
    Conv2D(16, (3, 3), activation = "relu", padding = "same"),
    MaxPool2D(),      
    Conv2D(16, (3, 3), activation = "relu", padding = "same"),      
    Conv2D(12, (3, 3), activation = "relu", padding = "same"),

    Conv2DTranspose(12, (3, 3), activation = "relu", padding = "same"),
    Conv2DTranspose(16, (3, 3), activation = "relu", padding = "same"),      
    UpSampling2D(),      
    Conv2DTranspose(16, (3, 3), activation = "relu", padding = "same"),
    Conv2DTranspose(20, (3, 3), activation = "relu", padding = "same"), 

    Conv2DTranspose(3, (3, 3), activation = "sigmoid", padding = "same")
])

In [164]:
class_encoder_decoder.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_307 (Conv2D)         (None, 32, 32, 20)        560       
                                                                 
 conv2d_308 (Conv2D)         (None, 32, 32, 16)        2896      
                                                                 
 max_pooling2d_33 (MaxPooli  (None, 16, 16, 16)        0         
 ng2D)                                                           
                                                                 
 conv2d_309 (Conv2D)         (None, 16, 16, 16)        2320      
                                                                 
 conv2d_310 (Conv2D)         (None, 16, 16, 12)        1740      
                                                                 
 conv2d_transpose_48 (Conv2  (None, 16, 16, 12)        1308      
 DTranspose)                                         

In [165]:
class_encoder_decoder.compile(
    optimizer = "adam",
    loss = "mse",  # Mean Squared Error
)


In [168]:
# Normalize data for reconstruction
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0


In [171]:
class_encoder_decoder.fit(X_train, X_train, validation_data = (X_test, X_test), epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2f32c25cf90>

In [None]:
# Data Augmentation Pipeline

def preprocess_image(image, label):
    image = tf.cast(image, tf.float32) / 255.0  # Normalize to [0, 1]
    return image, label

# dataset = dataset.map(preprocess_image)