<a href="https://colab.research.google.com/github/darshan-hindocha/lab/blob/main/FGVC_on_the_stanford_car_dataset_using_BCNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Grained Visual Classification on the Stanford Car Dataset using Bilinear Convolutional Neural Networks


## Instructions for running this notebook

Download data from - https://www.kaggle.com/jutrera/stanford-car-dataset-by-classes-folder

Upload car_data.zip to Google Drive. Alternatively, try adding car_data.zip to your Drive directly from https://drive.google.com/file/d/16luuHDPxI2c9EhV2QvZKsKOeZyk9PPC_/view?usp=sharing

Upload 'anno_test.csv', 'anno_train.csv' and 'names.csv' to the Colab temporary file directory (the files can be found at https://www.kaggle.com/jutrera/stanford-car-dataset-by-classes-folder).

Mount Drive to Colab.

Copy path of 'car_data.zip' to image_folder_name variable below (might not need to change)

In [None]:
# Path of the dataset archive on the mounted Google Drive; edit this if car_data.zip lives elsewhere.
image_folder_name = '/content/drive/MyDrive/car_data.zip'

For neatness, some cells are collapsed, as below. Double-click a collapsed cell to see its contents, and make sure to run these cells as well.

In [None]:
#@title Imports
from tensorflow.keras.optimizers import RMSprop
from numpy import newaxis
from google.colab import files

from keras import backend as K
from keras.layers import Lambda, Reshape, Activation,Dense,Concatenate,Dropout
from keras.models import Model, Input, load_model
from keras.utils.vis_utils import plot_model

from keras.applications.vgg16 import VGG16
from keras.applications import ResNet101

from keras.initializers import glorot_normal
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from time import time
import numpy as np
import cv2
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Extract the dataset archive referenced by `image_folder_name` (set in the
# first code cell) into the current working directory.

from zipfile import ZipFile

file_name = image_folder_name

# The handle is called `archive` rather than `zip` so that the built-in zip()
# is not shadowed for the rest of the notebook.
with ZipFile(file_name, 'r') as archive:
  archive.extractall()
  print('Done')

Done


Ensure that after unzipping, the car_data folder is in the following path

In [None]:
# Root of the extracted dataset; later cells list its 'train' sub-folder to enumerate the classes.
image_folder = '/content/car_data/'

In [None]:
# One class per sub-directory of <image_folder>/train. Only directories are
# counted, so a stray file in that folder cannot inflate the class count that
# sizes the final Dense layer of the models.
num_classes = len([
    entry for entry in glob.glob(os.path.join(image_folder, 'train', '*'))
    if os.path.isdir(entry)
])

## Pre-Processing (bounding boxes)

The data comes with bounding boxes, which we use to crop the images. Basic resizing is also applied (by the data generators in the next section).

In [None]:
#@title Read bounding information CSVs

# The three files are read without a header row (header=None), so their
# columns are addressed by integer position in the cells that follow.
df_bound_boxes_train, df_bound_boxes_test, df_names = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('anno_train.csv', 'anno_test.csv', 'names.csv')
)

In [None]:
#@title Make folders for bounded images

# Mirror the class sub-folders of <image_folder>/train under
# /content/car_data_bounded/{train,test}.
# os.makedirs(..., exist_ok=True) creates missing parents and makes this cell
# safe to re-run; os.mkdir raised FileExistsError on a second execution.
for split in ('train', 'test'):
  os.makedirs(os.path.join('/content/car_data_bounded', split), exist_ok=True)

for class_label in glob.glob(image_folder + '/train/*'):
  class_name = os.path.basename(class_label)
  for split in ('train', 'test'):
    os.makedirs(os.path.join('/content/car_data_bounded', split, class_name), exist_ok=True)

In [None]:
## Crop and store bounded images

for split, annotations in (('train', df_bound_boxes_train), ('test', df_bound_boxes_test)):

  # Index the annotation rows by image file name once per split instead of
  # scanning the whole DataFrame for every image. The columns are positional:
  # file name, x1, y1, x2, y2, class. setdefault keeps the first row for a
  # given name, which is the row the previous per-image lookup selected.
  box_by_name = {}
  for row in annotations.itertuples(index=False):
    box_by_name.setdefault(row[0], tuple(int(value) for value in row[1:5]))

  for image_path in glob.glob('car_data/' + split + '/*/*'):

    # .../<class folder>/<image file>
    class_dir, image_name = image_path.split('/')[-2:]

    box = box_by_name.get(image_name)
    if box is None:
      print('No bounding box for', image_path, '- skipped')
      continue

    image = cv2.imread(image_path)
    if image is None:
      # cv2.imread signals an unreadable file by returning None, not by raising
      print('Could not read', image_path, '- skipped')
      continue

    x1, y1, x2, y2 = box
    crop_image = image[y1:y2, x1:x2]

    cv2.imwrite(os.path.join('/content/car_data_bounded', split, class_dir, image_name), crop_image)

## Data Augmentation

Using the Keras ImageDataGenerator Tool

In [None]:
## ImageDataGenerator uses file path to obtain the class of images

base_dir = 'car_data_bounded'
train_dir, validation_dir = (
    os.path.join(base_dir, split_name) for split_name in ('train', 'test')
)

## Training images: pixel values scaled by 1/255, plus random rotation,
## shear and horizontal flip
augmentation_options = {
    'rotation_range': 10,
    'shear_range': 0.2,
    'horizontal_flip': True,
}
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

## Validation Data is NOT augmented
val_datagen = ImageDataGenerator(rescale=1./255)

## Both generators resize every image to 150x150 and yield batches of 32
flow_options = {'target_size': (150, 150), 'batch_size': 32}

## Flow training images using train_datagen generator
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

## Flow validation images using val_datagen generator
validation_generator = val_datagen.flow_from_directory(validation_dir, **flow_options)


Found 8144 images belonging to 196 classes.
Found 8041 images belonging to 196 classes.


In [None]:
#@title Keras Bilinear Layers


## We use the following Layer operations from: https://github.com/ryanfwy/BCNN-keras-clean
def _outer_product(x):
    return K.batch_dot(x[0], x[1], axes=[1, 1]) / x[0].get_shape().as_list()[1]

def _signed_sqrt(x):
    return K.sign(x) * K.sqrt(K.abs(x) + 1e-9)

def _l2_normalize(x, axis=-1):
    return K.l2_normalize(x, axis=axis)



## Our bilinear layer - not used
def bilinear_layer(inputs):
    """Correlate every spatial location of stream A with every location of stream B.

    Args:
        inputs: pair (x1, x2) of 4-D tensors laid out as
            (batch, height, width, channel). x2 is reshaped with the static
            height, width and channel sizes of x1, so the two must agree.

    Returns:
        Tensor of shape (batch, (height*width)**2): for each sample, the dot
        product over channels between each pair of spatial locations,
        flattened to one vector.
    """
    x1, x2 = inputs
    _, h, w, channels = K.int_shape(x1)
    h_w = h * w

    # -1 lets the backend infer the batch size, which is None on symbolic tensors.
    flat_a = K.reshape(x1, (-1, h_w, channels))
    flat_b = K.reshape(x2, (-1, h_w, channels))

    # Same contraction as einsum 'bmc,bnc->bmn', expressed with the backend
    # module this notebook already imports (the previous version used `tf`,
    # which is never imported here).
    out = K.batch_dot(flat_a, flat_b, axes=[2, 2])

    return K.reshape(out, (-1, h_w * h_w))


## BCNN-VGG16 (196 classes) original implementation with transfer learning

In [None]:
## function to create model

def create_model():
  """Build and compile the two-stream bilinear CNN on frozen VGG16 features.

  Both streams are ImageNet VGG16 networks without the classifier head,
  sharing one 150x150x3 input. Their layer-18 feature maps are combined by
  the bilinear (outer product) layer, passed through signed square-root and
  L2 normalisation, and classified by a softmax layer with `num_classes`
  units.

  Returns:
    The compiled Keras model (categorical cross-entropy, Adam, accuracy).
  """
  # 150x150 pixels with three colour channels (R, G, B)
  img_input1 = Input(shape=[150, 150, 3])

  ## Stream A - Transfer Learning
  stream_A = VGG16(input_tensor=img_input1, include_top=False, weights='imagenet')

  ## Freeze the weights and suffix the layer names so that they differ from
  ## the names of the stream B layers
  for layer in stream_A.layers:
    layer.trainable = False
    layer._name += '1'

  feature_layer_A = stream_A.layers[18]
  output_stream_A = feature_layer_A.output
  A_out_shape = feature_layer_A.output_shape
  # flatten the spatial grid: (height, width, channels) -> (height*width, channels)
  flatten_A = Reshape([A_out_shape[1] * A_out_shape[2], A_out_shape[3]])
  output_stream_A = flatten_A(output_stream_A)

  ## Stream B - Transfer Learning
  stream_B = VGG16(input_tensor=img_input1, include_top=False, weights='imagenet')
  for layer in stream_B.layers:
    layer.trainable = False

  feature_layer_B = stream_B.layers[18]
  output_stream_B = feature_layer_B.output
  B_out_shape = feature_layer_B.output_shape
  flatten_B = Reshape([B_out_shape[1] * B_out_shape[2], B_out_shape[3]])
  output_stream_B = flatten_B(output_stream_B)

  ## Bilinear layer, flattened to one vector of channels_A * channels_B values
  bilinear = Lambda(_outer_product)([output_stream_A, output_stream_B])
  bilinear = Reshape([A_out_shape[-1] * B_out_shape[-1]])(bilinear)

  # Signed square-root Layer
  bilinear = Lambda(_signed_sqrt)(bilinear)
  # L2 normalization Layer
  bilinear = Lambda(_l2_normalize)(bilinear)

  # Linear classification layer with num_classes units, followed by softmax
  logits = Dense(num_classes, kernel_initializer=glorot_normal())(bilinear)
  output = Activation('softmax')(logits)

  bcnn_model = Model(img_input1, output)

  ## Configure and compile the model
  bcnn_model.compile(
      loss='categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'])
  return bcnn_model

In [None]:
# Build the compiled BCNN-VGG16 model; this is the object that run_training() below fits and saves.
bcnn_model = create_model()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


Training runs in groups of 10 epochs so that, if the runtime collapses, the results saved so far are not lost.

In [None]:
def run_training(n_groups=10, epochs_per_group=10):
  """Train `bcnn_model` in groups of epochs, checkpointing to Drive after each group.

  Args:
    n_groups: number of consecutive fit() calls (default 10, as before).
    epochs_per_group: epochs per fit() call (default 10, as before).

  After every group the fit history is written to
  /content/drive/MyDrive/history{group}.csv and the whole model is saved to
  /content/drive/MyDrive/bcnn_model.h5, overwriting the previous checkpoint.
  """
  for group in range(n_groups):
    print('Training Group: ',group)
    history = bcnn_model.fit(train_generator,epochs=epochs_per_group,validation_data=validation_generator,verbose=2)

    hist_df = pd.DataFrame(history.history)
    hist_df.to_csv(f'/content/drive/MyDrive/history{group}.csv')

    bcnn_model.save('/content/drive/MyDrive/bcnn_model.h5', overwrite=True)
run_training()

## BCNN-VGG16 (196 classes) w/ Dropout Layer + Transfer Learning

In [None]:
def create_v2_model():
  """Build and compile the two-stream BCNN on frozen VGG16 features, with dropout.

  The architecture is the bilinear VGG16 model with a Dropout(0.5) layer
  inserted between the L2 normalisation and the softmax classifier.

  Returns:
    The compiled Keras model (categorical cross-entropy, Adam, accuracy).
  """
  # 150x150 pixels with three colour channels (R, G, B)
  img_input1 = Input(shape=[150, 150, 3])

  ## Stream A - Transfer Learning
  stream_A = VGG16(input_tensor=img_input1, include_top=False, weights='imagenet')

  ## Freeze the weights and suffix the layer names so that they differ from
  ## the names of the stream B layers
  for layer in stream_A.layers:
    layer.trainable = False
    layer._name += '1'

  feature_layer_A = stream_A.layers[18]
  output_stream_A = feature_layer_A.output
  A_out_shape = feature_layer_A.output_shape
  # flatten the spatial grid: (height, width, channels) -> (height*width, channels)
  flatten_A = Reshape([A_out_shape[1] * A_out_shape[2], A_out_shape[3]])
  output_stream_A = flatten_A(output_stream_A)

  ## Stream B - Transfer Learning
  stream_B = VGG16(input_tensor=img_input1, include_top=False, weights='imagenet')
  for layer in stream_B.layers:
    layer.trainable = False

  feature_layer_B = stream_B.layers[18]
  output_stream_B = feature_layer_B.output
  B_out_shape = feature_layer_B.output_shape
  flatten_B = Reshape([B_out_shape[1] * B_out_shape[2], B_out_shape[3]])
  output_stream_B = flatten_B(output_stream_B)

  ## Bilinear layer, flattened to one vector of channels_A * channels_B values
  bilinear = Lambda(_outer_product)([output_stream_A, output_stream_B])
  bilinear = Reshape([A_out_shape[-1] * B_out_shape[-1]])(bilinear)

  # Signed square-root Layer
  bilinear = Lambda(_signed_sqrt)(bilinear)
  # L2 normalization Layer
  bilinear = Lambda(_l2_normalize)(bilinear)

  # Dropout ahead of the classifier
  bilinear = Dropout(0.5)(bilinear)

  # Linear classification layer with num_classes units, followed by softmax
  logits = Dense(num_classes, kernel_initializer=glorot_normal())(bilinear)
  output = Activation('softmax')(logits)

  bcnn_v2_model = Model(img_input1, output)

  # Configure and compile the model
  bcnn_v2_model.compile(
      loss='categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'])

  return bcnn_v2_model


We tried initialising the new model with the weights of the first (overfit) model, as shown below, but considered it better to train the new model from scratch:
```
model_overfit = load_model('/content/drive/MyDrive/bcnn_model.h5')

overfit_weights = [layer.get_weights() for layer in model_overfit.layers]
  
for i in range(len(overfit_weights)-2):
  bcnn_v2_model.layers[i].set_weights(overfit_weights[i])
```



In [None]:
# Build the compiled dropout variant; this is the object that run_training() below fits and saves.
bcnn_v2_model = create_v2_model()

In [None]:
def run_training(n_groups=10, epochs_per_group=10):
  """Train `bcnn_v2_model` in groups of epochs, checkpointing to Drive after each group.

  Args:
    n_groups: number of consecutive fit() calls (default 10, as before).
    epochs_per_group: epochs per fit() call (default 10, as before).

  After every group the fit history is written to
  /content/drive/MyDrive/history_v2_{group}.csv and the whole model is saved
  to /content/drive/MyDrive/bcnn_v2_model.h5, overwriting the previous
  checkpoint.
  """
  for group in range(n_groups):
    print('Training Group: ',group)
    history = bcnn_v2_model.fit(train_generator,epochs=epochs_per_group,validation_data=validation_generator,verbose=2)

    hist_df = pd.DataFrame(history.history)
    hist_df.to_csv(f'/content/drive/MyDrive/history_v2_{group}.csv')

    bcnn_v2_model.save('/content/drive/MyDrive/bcnn_v2_model.h5', overwrite=True)
run_training()

Training Group:  0
Epoch 1/10
255/255 - 100s - loss: 0.5342 - accuracy: 0.9584 - val_loss: 2.0724 - val_accuracy: 0.5099
Epoch 2/10
255/255 - 100s - loss: 0.5161 - accuracy: 0.9605 - val_loss: 2.0572 - val_accuracy: 0.5136
Epoch 3/10
255/255 - 100s - loss: 0.4877 - accuracy: 0.9629 - val_loss: 2.0473 - val_accuracy: 0.5115
Epoch 4/10
255/255 - 100s - loss: 0.4658 - accuracy: 0.9683 - val_loss: 2.0291 - val_accuracy: 0.5155
Epoch 5/10
255/255 - 100s - loss: 0.4373 - accuracy: 0.9732 - val_loss: 2.0294 - val_accuracy: 0.5192
Epoch 6/10
255/255 - 99s - loss: 0.4248 - accuracy: 0.9708 - val_loss: 2.0289 - val_accuracy: 0.5195
Epoch 7/10
255/255 - 100s - loss: 0.3988 - accuracy: 0.9727 - val_loss: 2.0200 - val_accuracy: 0.5186
Epoch 8/10
255/255 - 101s - loss: 0.3811 - accuracy: 0.9742 - val_loss: 1.9868 - val_accuracy: 0.5248
Epoch 9/10
255/255 - 101s - loss: 0.3622 - accuracy: 0.9786 - val_loss: 1.9929 - val_accuracy: 0.5237
Epoch 10/10
255/255 - 100s - loss: 0.3519 - accuracy: 0.9802 - v

## BCNN-ResNet101 (196 classes) w/ Dropout Layer + transfer learning

In [None]:
def create_resnet_model():
  """Build and compile the two-stream BCNN with ResNet101 streams and dropout.

  Both streams are ImageNet ResNet101 networks without the classifier head,
  sharing one 150x150x3 input. Unlike the VGG16 variants, their weights are
  left trainable.

  Returns:
    The compiled Keras model (categorical cross-entropy, Adam, accuracy).
  """
  # 150x150 pixels with three colour channels (R, G, B)
  img_input1 = Input(shape=[150, 150, 3])

  ## Transfer Learning Stream A
  stream_A = ResNet101(include_top=False, weights="imagenet", input_tensor=img_input1)

  ## The weights are not frozen - only the layer names are suffixed so that
  ## they do not clash with the other stream
  for layer in stream_A.layers:
    layer._name += '1'

  # NOTE(review): index 18 is the same index used for the VGG16 streams; in
  # ResNet101 it presumably selects an early layer rather than the final
  # feature map - confirm that this is intended.
  feature_layer_A = stream_A.layers[18]
  output_stream_A = feature_layer_A.output
  A_out_shape = feature_layer_A.output_shape
  # flatten the spatial grid: (height, width, channels) -> (height*width, channels)
  flatten_A = Reshape([A_out_shape[1] * A_out_shape[2], A_out_shape[3]])
  output_stream_A = flatten_A(output_stream_A)

  ## Transfer Learning Stream B
  stream_B = ResNet101(include_top=False, weights="imagenet", input_tensor=img_input1)

  feature_layer_B = stream_B.layers[18]
  output_stream_B = feature_layer_B.output
  B_out_shape = feature_layer_B.output_shape
  flatten_B = Reshape([B_out_shape[1] * B_out_shape[2], B_out_shape[3]])
  output_stream_B = flatten_B(output_stream_B)

  ## Bilinear Layer, flattened to one vector of channels_A * channels_B values
  bilinear = Lambda(_outer_product)([output_stream_A, output_stream_B])
  bilinear = Reshape([A_out_shape[-1] * B_out_shape[-1]])(bilinear)
  # Signed square-root
  bilinear = Lambda(_signed_sqrt)(bilinear)
  # L2 normalization
  bilinear = Lambda(_l2_normalize)(bilinear)

  # Dropout ahead of the softmax classifier with num_classes units
  bilinear = Dropout(0.5)(bilinear)
  logits = Dense(num_classes, kernel_initializer=glorot_normal())(bilinear)
  output = Activation('softmax')(logits)

  bcnn_resnet_model = Model(img_input1, output)

  # Configure and compile the model
  bcnn_resnet_model.compile(
      loss='categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'])

  return bcnn_resnet_model

In [None]:
# Build the compiled ResNet101 variant; this is the object that run_training() below fits and saves.
bcnn_resnet_model = create_resnet_model()

In [None]:
def run_training(n_groups=10, epochs_per_group=10):
  """Train `bcnn_resnet_model` in groups of epochs, checkpointing to Drive after each group.

  Args:
    n_groups: number of consecutive fit() calls (default 10, as before).
    epochs_per_group: epochs per fit() call (default 10, as before).

  After every group the fit history is written to
  /content/drive/MyDrive/history_resnet_{group}.csv and the model weights
  (weights only, not the full model) are saved to
  /content/drive/MyDrive/bcnn_resnet_modelweights.h5, overwriting the
  previous checkpoint.
  """
  for group in range(n_groups):
    print('Training Group: ',group)
    history = bcnn_resnet_model.fit(train_generator,epochs=epochs_per_group,validation_data=validation_generator,verbose=2)

    hist_df = pd.DataFrame(history.history)
    hist_df.to_csv(f'/content/drive/MyDrive/history_resnet_{group}.csv')

    bcnn_resnet_model.save_weights('/content/drive/MyDrive/bcnn_resnet_modelweights.h5',overwrite=True)
run_training()

Training Group:  0
Epoch 1/10
255/255 - 98s - loss: 3.2011 - accuracy: 0.2933 - val_loss: 4.1136 - val_accuracy: 0.1420
Epoch 2/10
255/255 - 97s - loss: 3.0080 - accuracy: 0.3298 - val_loss: 4.2364 - val_accuracy: 0.1415
Epoch 3/10
255/255 - 97s - loss: 2.8412 - accuracy: 0.3691 - val_loss: 3.6135 - val_accuracy: 0.2148
Epoch 4/10
255/255 - 97s - loss: 2.6759 - accuracy: 0.3997 - val_loss: 4.0627 - val_accuracy: 0.1604
Epoch 5/10
255/255 - 97s - loss: 2.5249 - accuracy: 0.4447 - val_loss: 3.7064 - val_accuracy: 0.2110
Epoch 6/10
255/255 - 97s - loss: 2.3747 - accuracy: 0.4711 - val_loss: 3.6787 - val_accuracy: 0.1969
Epoch 7/10
255/255 - 97s - loss: 2.2159 - accuracy: 0.5183 - val_loss: 3.5206 - val_accuracy: 0.2450
Epoch 8/10
255/255 - 97s - loss: 2.1065 - accuracy: 0.5379 - val_loss: 3.6415 - val_accuracy: 0.2247
Epoch 9/10
255/255 - 97s - loss: 1.9763 - accuracy: 0.5699 - val_loss: 3.3521 - val_accuracy: 0.2720
Epoch 10/10
255/255 - 98s - loss: 1.8648 - accuracy: 0.5961 - val_loss: 