# GoogLeNet Implementation
In this notebook I'll reimplement famous CNN called GoogLeNet from [this](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Szegedy_Going_Deeper_With_2015_CVPR_paper.pdf) research paper.

In [173]:
# Import libraries
import tensorflow as tf
import numpy as np
import random

### Prepare Images

In [192]:
# This is the section where you can load images and preprocess them.
# GoogLeNet input shape is (224, 224, 3)
# In my case I'll create toy data
train_data = np.random.rand(10, 224, 224, 3)
train_labels = np.zeros([10, 1000])
for i in range(10):
    train_labels[i][random.randint(0, 999)] = 1
    
print(train_data.shape)
print(train_labels.shape)

(10, 224, 224, 3)
(10, 1000)


### Helping Funcions

In [216]:
def initialize_placeholders(img_shape, num_classes):

    # Initialize placeholder for inputs x
    x = tf.placeholder(tf.float32, [None,] + list(img_shape), name='x')
    
    # Initialize placeholder for labels y
    y = tf.placeholder(tf.float32, [None, num_classes], name='y')
    
    # Initialize placeholder for dropout
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    
    return x, y, keep_prob

In [71]:
def Convolution(x_tensor, output_dims, k_size, stride, padding="VALID"):
    
    # Get the number of channels
    x_channels = x_tensor.get_shape().as_list()[-1]
    
    # Initialize weights/filters and biases
    W = tf.Variable(tf.truncated_normal([k_size[0], k_size[1], x_channels, output_dims], stddev=0.01))
    b = tf.Variable(tf.truncated_normal([output_dims], stddev=0.01))
    
    # Perform convolution
    conv = tf.nn.conv2d(x_tensor, W, (1, stride[0], stride[1], 1), padding)
    conv = tf.nn.bias_add(conv, b)
    
    # Relu activation function
    conv = tf.nn.relu(conv)
    
    return conv

In [149]:
def MaxPooling(x_tensor, k_size=(1,1), stride=(1,1), padding="VALID"):
    # Reshape into dimensions acceptable by Tensorflow
    filter_size = (1, k_size[0], k_size[1], 1)
    strides = (1, stride[0], stride[1], 1)
    
    # Perform max pooling
    pool = tf.nn.max_pool(x_tensor, filter_size, strides, padding)
    
    return pool

In [150]:
def AvgPooling(x_tensor, k_size=(1,1), stride=(1,1), padding="VALID"):
    # Reshape into dimensions acceptable by Tensorflow
    filter_size = (1, k_size[0], k_size[1], 1)
    strides = (1, stride[0], stride[1], 1)
    
    # Perform max pooling
    pool = tf.nn.avg_pool(x_tensor, filter_size, strides, padding)
    
    return pool

In [110]:
def Flatten(x_tensor):
    # Get tensor's shape
    shape = x_tensor.get_shape().as_list()
    
    # Get reshaped size
    reshaped_size = shape[1] * shape[2] * shape[3]
    
    # Flatten the matrix
    flat = tf.reshape(x_tensor, [-1, reshaped_size])
    
    return flat

In [111]:
def FullyConnected(x_tensor, num_output, activation="relu"):
    # Get the tensor's shape
    shape = x_tensor.get_shape().as_list()
    
    # Initialize weights and biases
    W = tf.Variable(tf.truncated_normal([shape[-1], num_output], stddev=0.01))
    b = tf.Variable(tf.truncated_normal([num_output], stddev=0.01))
    
    # Perform fully connected forward pass
    fc = tf.add(tf.matmul(x_tensor, W), b)
    
    # Activation
    if activation == "relu":
        fc = tf.nn.relu(fc)
        
    return fc

In [112]:
def LocalResponseNormalization(x_tensor):
    return tf.nn.local_response_normalization(x_tensor)

In [154]:
def Dropout(x_tensor, keep_prob):
    return tf.nn.dropout(x_tensor, keep_prob)

In [155]:
def Inception(x_tensor, dims):
    """
    dims = an array of output dimensions (conv_1x1, conv_3x3_reduced, conv_3x3, conv_5x5_reduced, conv_5x5, pool_reduction_dims)
    """
    
    # 1x1 Convolution
    conv_1x1 = Convolution(x_tensor, dims[0], k_size=(1,1), stride=(1,1), padding="SAME")
    
    # 3x3 Convolution
    conv_3x3_reduced = Convolution(x_tensor, dims[1], k_size=(1,1), stride=(1,1), padding="SAME")
    conv_3x3 = Convolution(conv_3x3_reduced, dims[2], k_size=(3,3), stride=(1,1), padding="SAME")
    
    # 5x5 Convolution
    conv_5x5_reduced = Convolution(x_tensor, dims[3], k_size=(1,1), stride=(1,1), padding="SAME")
    conv_5x5 = Convolution(conv_5x5_reduced, dims[4], k_size=(5,5), stride=(1,1), padding="SAME")
    
    # Pooling
    pool = MaxPooling(x_tensor, k_size=(3,3), stride=(1,1), padding="SAME")
    pool = Convolution(pool, dims[5], k_size=(1,1), stride=(1,1), padding="SAME")
    
    
    
    # Concatenate the layers depth-wise where axis(0=num examples, 1=height, 2=width, 3=depth)
    concat = tf.concat([conv_1x1, conv_3x3, conv_5x5, pool], axis=3)
    
    return concat

In [193]:
def get_batches(images, labels, batch_size):
    ''' Create a generator of image batches as a tuple (inputs, targets) '''
    
    n_batches = len(images) //batch_size
    
    # only full batches
    images = images[:n_batches*batch_size]
    
    for idx in range(0, len(images), batch_size):
        batch_inputs = images[idx:idx+batch_size]
        batch_labels = labels[idx:idx+batch_size]
        
        yield (np.array(batch_inputs) , np.array(batch_labels, ndmin=2))

### Create a network
According to google's research paper this network consists of 22 layers when counting only layers with parameters(27 if we also count pooling). The overall number of layers (independent building blocks) used for the construction of the network is about 100.

In [194]:
def GoogLeNet(X, num_classes, keep_prob):
    
    # First Block
    model = Convolution(X, 64, k_size=(7,7), stride=(2,2), padding="SAME")
    model = MaxPooling(model, k_size=(3,3), stride=(2,2), padding="SAME")
    model = LocalResponseNormalization(model)
    
    # Second Block
    model = Convolution(model, 64, k_size=(1,1), stride=(1,1), padding="SAME")
    model = Convolution(model, 192, k_size=(3,3), stride=(1,1), padding="SAME")
    model = MaxPooling(model, k_size=(3,3), stride=(2,2), padding="SAME")
    model = LocalResponseNormalization(model)
    
    ### INCEPTION LAYERS ###
    model = Inception(model, dims = (64, 96, 128, 16, 32, 32))
    model = Inception(model, dims = (128,128, 192, 32, 96, 64))
    
    # Maxpooling
    model = MaxPooling(model, k_size=(3,3), stride=(2,2), padding="SAME")
    
    # Inception Block
    model = Inception(model, dims = (192, 96, 208, 16, 48, 64))
    model = Inception(model, dims = (160, 112, 224, 24, 64, 64))
    model = Inception(model, dims = (128, 128, 256, 24, 64, 64))
    model = Inception(model, dims = (112, 144, 288, 32, 64, 64))
    model = Inception(model, dims = (256, 160, 320, 32, 128, 128))
    
    # Maxpooling
    model = MaxPooling(model, k_size=(3,3), stride=(2,2), padding="SAME")
    
    # Inception Block
    model = Inception(model, dims = (256, 160, 320, 32, 128, 128))
    model = Inception(model, dims = (384, 192, 384, 48, 128, 128))
    
    
    ### OUTPUT ###
    model = AvgPooling(model, k_size=(7,7), stride=(1,1), padding="VALID")
    model = Dropout(model, keep_prob)
    
    # Fully Connected Convolution
    fc = Convolution(model, num_classes, k_size=(1,1), stride=(1,1), padding="SAME")
    
    # Softmax activation function for choosing classes
    output = tf.nn.softmax(fc)

    return output

### Train

In [223]:
epochs = 60
batch_size = 10
learning_rate = 0.01
momentum_rate = 0.9
keep_probobability = 1
num_classes = 1000

In [224]:
tf.reset_default_graph

x, y, keep_prob = initialize_placeholders((224, 224, 3), num_classes)
out = GoogLeNet(x, num_classes, keep_prob)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=out, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost)

In [225]:
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    
    for epoch in range(epochs):
        batches = get_batches(train_data, train_labels, batch_size)
        
        for i, (inputs, labels) in enumerate(batches):
            loss, _ = sess.run([cost, optimizer], feed_dict={x: inputs, y: labels, keep_prob: keep_probobability})
            print(loss)

6.907762
6.907762
6.907762
6.907762
6.907762


KeyboardInterrupt: 