# V&P project - Image Captioning

Preprocessing of data from COCO Dataset.  



## Set Up

Mount the Google Drive directory in which the data will be stored,
then import the required libraries.

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the later cells can
# change into the project folder stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
% cd drive
% cd MyDrive
% cd VisionAndPerception
% cd V&P_PROJECT

/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/VisionAndPerception
/content/drive/MyDrive/VisionAndPerception/V&P_PROJECT


In [None]:
import os
import json
import random
import nltk
nltk.download('punkt')
import pickle
from shutil import copyfile
from collections import Counter
from PIL import Image

import gensim.downloader
from gensim.models import KeyedVectors

import torch
from tqdm.notebook import tqdm
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Show which gensim version is active in the kernel; the embedding code below
# relies on the KeyedVectors attribute `index_to_key`.
import gensim
print(gensim.__version__)

4.2.0


## Data preprocessing

Downloading the dataset and assembling the data are one-time processing
steps. If you are re-running the notebook only to resume or restart training,
you do not need to repeat the steps in this section.

### Download Train- Val- Test- Data

In [None]:
# Create directory coco_data to contain our data
!mkdir coco_data

mkdir: cannot create directory ‘coco_data’: File exists


In [None]:
# Download Train- Data
!wget http://images.cocodataset.org/zips/train2017.zip
!unzip ./train2017.zip -d ./coco_data/
!rm ./train2017.zip

# Download Val- Data
!wget http://images.cocodataset.org/zips/val2017.zip
!unzip ./val2017.zip -d ./coco_data/
!rm ./val2017.zip

"""# Download Test- Data
!wget http://images.cocodataset.org/zips/test2017.zip
!unzip ./test2017.zip -d ./coco_data/
!rm ./test2017.zip"""

# Download Train- and Val- annotations
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip ./annotations_trainval2017.zip -d ./coco_data/
!rm ./annotations_trainval2017.zip
!rm ./coco_data/annotations/person_keypoints_train2017.json
!rm ./coco_data/annotations/person_keypoints_val2017.json

### (class) Assembling the data

Since the COCO dataset is huge, in order to keep things simple and, above all, runnable on our Colab GPU, we will consider only:


*   4 of the 91 categories
*   1000 images for each category  

**This reduces the performance of the proposed model, since the effective training set is composed of few images. However, it allows us to present more clearly the results of the training session and the improvement due to the attention mechanism with respect to the proposed baseline model.**



In [None]:
class limited_COCO_Dataset():
    """Build a reduced COCO captioning dataset restricted to a few categories.

    Instantiating the class runs the whole pipeline for the "train" and "val"
    splits: select image ids by category, filter the caption annotations to
    those images, write `./coco_data/<split>_captions.json`, copy the selected
    images to `./coco_data/images/<split>_images/`, and finally write 256x256
    copies to `./coco_data/images/resized_<split>2017/`.

    Args:
        train_instances_file: path to the COCO instances JSON of the train split.
        train_caption_file: path to the COCO captions JSON of the train split.
        val_instances_file: path to the COCO instances JSON of the val split.
        val_caption_file: path to the COCO captions JSON of the val split.
        category_list: category ids to keep.
        images_for_category: maximum number of images per category, or -1 to
            keep every image that contains the category.
    """

    def __init__(self,
                 train_instances_file,
                 train_caption_file,
                 val_instances_file,
                 val_caption_file,
                 category_list,
                 images_for_category):
        self.train_instances_file = train_instances_file
        self.train_caption_file = train_caption_file
        self.val_instances_file = val_instances_file
        self.val_caption_file = val_caption_file
        self.category_list = category_list
        self.images_for_category = images_for_category

        # Create limited dataset
        self.create_limited_dataset(self.train_instances_file, self.train_caption_file, "train")
        self.create_limited_dataset(self.val_instances_file, self.val_caption_file, "val")

        # Resize the images
        self.resize_images('./coco_data/images/train_images/', './coco_data/images/resized_train2017/', [256,256])
        self.resize_images('./coco_data/images/val_images/', './coco_data/images/resized_val2017/', [256,256])

    def create_limited_dataset(self, object_file, caption_file, typo):
        """Run selection, caption filtering and file writing for one split.

        `typo` is the split name ("train" or "val") used to build file paths.
        """
        filtered_images = self.select_images_by_categories(object_file)
        captions, filtered_image_file_names = self.select_captions(caption_file, filtered_images)
        self.write_caption_file(captions, filtered_image_file_names, typo)

    def select_images_by_categories(self, object_file):
        """Return the set of image ids selected for `self.category_list`.

        With `images_for_category == -1` every image containing one of the
        categories is returned. Otherwise, for each category, images are taken
        in order of "prominence": first the images where the category has the
        largest annotated area among all categories in that image, then those
        where it is second largest, and so on, until `images_for_category`
        images are collected (a random subset is drawn from the last
        prominence level if it does not fit entirely).
        """
        # category id -> {image id -> (later) categories of that image by area}
        category_dict = {category_id: dict() for category_id in self.category_list}
        # image id -> {category id -> largest annotated area of that category}
        all_images = dict()
        filtered_images = set()

        with open(object_file) as json_file:
            object_detections = json.load(json_file)

        for annotation in object_detections['annotations']:
            category_id = annotation['category_id']
            image_id = annotation['image_id']
            area = annotation['area']
            if category_id in category_dict:
                category_dict[category_id].setdefault(image_id, [])
            image_areas = all_images.setdefault(image_id, dict())
            if category_id not in image_areas or area > image_areas[category_id]:
                image_areas[category_id] = area

        if self.images_for_category == -1:
            for category_id in category_dict:
                print("Processing category {}".format(category_id))
                filtered_images.update(category_dict[category_id].keys())
                print("  Filtered total {} images of category {}".format(len(category_dict[category_id].keys()), category_id))
        else:
            # Replace each image's {category: area} map by its categories sorted
            # by decreasing area. Sorting the categories themselves (instead of
            # looking each area up with list.index) keeps every category exactly
            # once even when two categories have the same area.
            for image_id in all_images:
                image_areas = all_images[image_id]
                all_images[image_id] = sorted(image_areas, key=image_areas.get, reverse=True)

            for category_id in category_dict:
                print("Processing category {}".format(category_id))
                for image_id in category_dict[category_id]:
                    category_dict[category_id][image_id] = all_images[image_id]
                prominance_index = 0
                prominent_image_ids = []
                while len(category_dict[category_id]) > 0 and len(prominent_image_ids) < self.images_for_category:
                    remaining_count = self.images_for_category - len(prominent_image_ids)
                    # Images in which this category ranks exactly at prominance_index.
                    image_ids = [
                        image_id
                        for image_id, ranked_categories in category_dict[category_id].items()
                        if ranked_categories.index(category_id) == prominance_index
                    ]
                    for image_id in image_ids:
                        del category_dict[category_id][image_id]
                    if len(image_ids) <= remaining_count:
                        prominent_image_ids = prominent_image_ids + image_ids
                        if prominance_index > 4:
                            print(image_ids)
                        print("  Added all {} images at prominance_index {}".format(len(image_ids), prominance_index))
                    else:
                        # Too many candidates at this level: keep a random subset.
                        random.shuffle(image_ids)
                        prominent_image_ids = prominent_image_ids + image_ids[0:remaining_count]
                        print("  Added {} images at prominance_index {} out of {} images".format(remaining_count, prominance_index, len(image_ids)))
                    prominance_index = prominance_index + 1
                filtered_images.update(prominent_image_ids)
                print("  Completed filtering of total {} images of category {}".format(len(prominent_image_ids), category_id))

        # Returned for both branches; previously the -1 branch fell through
        # and returned None.
        print("Processed all categories. Number of filtered images is {}".format(len(filtered_images)))
        return filtered_images

    def select_captions(self, caption_file, filtered_images):
        """Filter a COCO captions file down to `filtered_images`.

        Returns:
            (captions, filtered_image_file_names): the loaded captions dict with
            its 'annotations' and 'images' lists restricted to the selected image
            ids, and the set of file names of the selected images.
        """
        with open(caption_file) as json_file:
            captions = json.load(json_file)

        captions['annotations'] = [
            annotation for annotation in captions['annotations']
            if annotation['image_id'] in filtered_images
        ]
        print("Number of filtered annotations is {}".format(len(captions['annotations'])))

        images = []
        filtered_image_file_names = set()
        for image in captions['images']:
            if image['id'] in filtered_images:
                images.append(image)
                filtered_image_file_names.add(image['file_name'])
        captions['images'] = images
        print("Expected number of filtered images is {}, actual number is {}".format(len(filtered_images), len(captions['images'])))
        return captions, filtered_image_file_names

    def write_caption_file(self, captions, filtered_image_file_names, typo):
        """Write the filtered captions JSON and copy the selected images.

        Images are copied from `./coco_data/images/<typo>2017/` to
        `./coco_data/images/<typo>_images/`.
        """
        with open("./coco_data/{}_captions.json".format(typo), 'w+') as output_file:
            json.dump(captions, output_file)

        # NOTE(review): the source folder below must already contain the raw
        # images of the split - confirm it matches where the archives were unzipped.
        source_dir = "./coco_data/images/{}2017".format(typo)
        destination_dir = "./coco_data/images/{}_images".format(typo)
        # copyfile does not create missing directories.
        os.makedirs(destination_dir, exist_ok=True)
        for file_name in filtered_image_file_names:
            copyfile("{}/{}".format(source_dir, file_name),
                     "{}/{}".format(destination_dir, file_name))

    def resize_images(self, input_path, output_path, new_size):
        """Save a resized copy of every file in `input_path` to `output_path`.

        `new_size` is a (width, height) pair; the file names are preserved.
        """
        os.makedirs(output_path, exist_ok=True)
        image_files = os.listdir(input_path)
        num_images = len(image_files)
        for i, img in enumerate(image_files):
            with Image.open(os.path.join(input_path, img)) as image:
                # Image.ANTIALIAS was an alias of LANCZOS and no longer exists
                # in Pillow >= 10; LANCZOS is the same filter under its current name.
                resized = image.resize(tuple(new_size), Image.LANCZOS)
                # The output format is inferred from the file extension.
                resized.save(os.path.join(output_path, img))
            if (i+1) % 100 == 0 or (i+1) == num_images:
                print("Resized {} out of {} total images.".format(i+1, num_images))


### (class) Build the Vocabulary

In [None]:
class Vocabulary(object):
    """Two-way mapping between tokens and consecutive integer ids.

    Ids are handed out in insertion order starting from 0. Calling the
    instance with a token returns its id, or the id of '<unk>' when the
    token is not known.
    """

    def __init__(self):
        self.token_to_int = {}
        self.int_to_token = {}
        self.current_index = 0  # id that the next new token will receive

    def __call__(self, token):
        """Return the id of `token`, falling back to the id of '<unk>'."""
        try:
            return self.token_to_int[token]
        except KeyError:
            return self.token_to_int['<unk>']

    def __len__(self):
        """Number of distinct tokens stored."""
        return len(self.token_to_int)

    def add_token(self, token):
        """Register `token` under the next free id; known tokens are ignored."""
        if token in self.token_to_int:
            return
        new_id = self.current_index
        self.token_to_int[token] = new_id
        self.int_to_token[new_id] = token
        self.current_index = new_id + 1

In [None]:
def build_vocabulary(json_path_train, json_path_val, threshold_occurences):
  """Build a Vocabulary from the captions of the train and val caption files.

  Every caption is lower-cased and tokenized with nltk; tokens occurring at
  least `threshold_occurences` times over both files are added after the
  special tokens '<pad>', '<start>', '<end>' and '<unk>' (ids 0-3).

  Args:
    json_path_train: path to the train captions JSON (needs an 'annotations' list).
    json_path_val: path to the val captions JSON (same layout).
    threshold_occurences: minimum number of occurrences for a token to be kept.

  Returns:
    The populated Vocabulary.
  """
  # Load both files up front, train first.
  caption_sets = []
  for json_path in (json_path_train, json_path_val):
    with open(json_path) as json_file:
      caption_sets.append(json.load(json_file))

  # Count token occurrences over train, then val.
  counter = Counter()
  for caption_set in caption_sets:
    annotations = caption_set['annotations']
    total = len(annotations)
    for seen, annotation in enumerate(annotations, start=1):
      counter.update(nltk.tokenize.word_tokenize(annotation['caption'].lower()))
      if seen % 1000 == 0 or seen == total:
        print("Tokenized {} out of total {} captions.".format(seen, total))

  frequent_tokens = [token for token, occurrences in counter.items()
                     if occurrences >= threshold_occurences]

  vocabulary = Vocabulary()
  for special_token in ('<pad>', '<start>', '<end>', '<unk>'):
    vocabulary.add_token(special_token)
  for token in frequent_tokens:
    vocabulary.add_token(token)
  return vocabulary

### Build Dataset, Vocabulary and Embeddings

In [None]:
# Build dataset
train_instances_file = "./coco_data/annotations/instances_train2017.json"
train_caption_file = "./coco_data/annotations/captions_train2017.json"
val_instances_file = "./coco_data/annotations/instances_val2017.json"
val_caption_file = "./coco_data/annotations/captions_val2017.json"
category_list = [5, 7, 37, 77]
images_for_category = 1000
## 5   airplane	    vehicle
## 7	 train	      vehicle
## 37	 sportsball	  sports
## 77	 cell phone	  electronic
!mkdir coco_data/train_images
!mkdir coco_data/val_images
limited_COCO_Dataset(train_instances_file,
                     train_caption_file,
                     val_instances_file,
                     val_caption_file,
                     category_list,
                     images_for_category)
!rm -rf ./coco_data/train2017
!rm -rf ./coco_data/val2017
!rm -rf ./coco_data/annotations
!rm -rf ./coco_data/train_images
!rm -rf ./coco_data/val_images

In [None]:
# Load the pretrained "glove-wiki-gigaword-300" word embeddings through
# gensim's downloader; the result is used below as a KeyedVectors object.
weights = gensim.downloader.load("glove-wiki-gigaword-300") # vectors_dim = 300



In [None]:
# Build vocabulary from the filtered caption files written by
# limited_COCO_Dataset. Paths are relative to the project folder (the current
# working directory), like every other ./coco_data path in this notebook.
vocabulary = build_vocabulary('./coco_data/train_captions.json',
                              './coco_data/val_captions.json',
                              threshold_occurences=1)
print("Total vocabulary size: {}".format(len(vocabulary)))

Tokenized 1000 out of total 20016 captions.
Tokenized 2000 out of total 20016 captions.
Tokenized 3000 out of total 20016 captions.
Tokenized 4000 out of total 20016 captions.
Tokenized 5000 out of total 20016 captions.
Tokenized 6000 out of total 20016 captions.
Tokenized 7000 out of total 20016 captions.
Tokenized 8000 out of total 20016 captions.
Tokenized 9000 out of total 20016 captions.
Tokenized 10000 out of total 20016 captions.
Tokenized 11000 out of total 20016 captions.
Tokenized 12000 out of total 20016 captions.
Tokenized 13000 out of total 20016 captions.
Tokenized 14000 out of total 20016 captions.
Tokenized 15000 out of total 20016 captions.
Tokenized 16000 out of total 20016 captions.
Tokenized 17000 out of total 20016 captions.
Tokenized 18000 out of total 20016 captions.
Tokenized 19000 out of total 20016 captions.
Tokenized 20000 out of total 20016 captions.
Tokenized 20016 out of total 20016 captions.
Tokenized 1000 out of total 3162 captions.
Tokenized 2000 out of

In [None]:
# Build embeddings
def adjust_weights_gensim(weights: "KeyedVectors", vocab, vectors_dim):
    """Build the embedding matrix for `vocab` from pretrained gensim vectors.

    Row `i` of the result is the embedding of token `vocab.int_to_token[i]`:
      * '<pad>'   -> a random vector drawn uniformly from [0, 1)
      * tokens present in `weights` -> their pretrained vector
      * every other token (including '<unk>') -> the mean of all pretrained vectors

    Args:
        weights: gensim KeyedVectors with the pretrained embeddings.
        vocab: vocabulary exposing `int_to_token` (id -> token) and `__len__`.
        vectors_dim: embedding size; must match the pretrained vector size.

    Returns:
        A CPU float tensor of shape (len(vocab), vectors_dim).
    """
    vectors = weights.vectors
    # Fallback vector for out-of-vocabulary tokens, computed once rather than
    # once per token.
    mean_vector = torch.tensor(np.mean(vectors, axis=0), dtype=torch.float32)
    pretrained_embeddings = torch.randn(len(vocab), vectors_dim)

    # Iterate over (id, token) pairs of the vocabulary passed in. Iterating the
    # dict directly would yield the integer ids, which never match any token.
    for index, token in tqdm(vocab.int_to_token.items(), total=len(vocab.int_to_token)):
        if token == '<pad>':
            row = torch.tensor(np.random.rand(vectors.shape[1]), dtype=torch.float32)
        elif token != '<unk>' and token in weights.key_to_index:
            # key_to_index is a dict, so this is a constant-time lookup.
            row = torch.tensor(weights[token], dtype=torch.float32)
        else:
            row = mean_vector
        # The matrix lives on the CPU, so no device transfer is needed here.
        pretrained_embeddings[index] = row
    return pretrained_embeddings

In [None]:
# Size of the pretrained GloVe vectors loaded above ("glove-wiki-gigaword-300").
vectors_dim = 300
embeddings = adjust_weights_gensim(weights=weights,
                                   vocab=vocabulary,
                                   vectors_dim=vectors_dim)

  0%|          | 0/5804 [00:00<?, ?it/s]

### Save Vocabulary and Embeddings

In [None]:
def save_vocabulary(vocabulary):
    """Pickle `vocabulary` to ./coco_data/vocabulary.pkl (relative to the cwd)."""
    os.makedirs("./coco_data", exist_ok=True)
    # `with` closes the file even if pickling raises.
    with open("./coco_data/vocabulary.pkl", "wb") as destination_file:
        pickle.dump(vocabulary, destination_file)
  
def save_embeddings(embeddings):
    """Pickle `embeddings` to ./coco_data/embeddings.pkl (relative to the cwd)."""
    os.makedirs("./coco_data", exist_ok=True)
    # `with` closes the file even if pickling raises.
    with open("./coco_data/embeddings.pkl", "wb") as destination_file:
        pickle.dump(embeddings, destination_file)

In [None]:
# Persist the embedding matrix and the vocabulary as pickle files under
# ./coco_data/ (embeddings.pkl and vocabulary.pkl).
save_embeddings(embeddings)
save_vocabulary(vocabulary)