In [None]:
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# your model focuses on during captioning
import matplotlib.pyplot as plt

import collections
import random
import numpy as np
import os
import time
import json
from PIL import Image

In [None]:
# # Download caption annotation files
# annotation_folder = '/annotations/'
# if not os.path.exists(os.path.abspath('.') + annotation_folder):
#   annotation_zip = tf.keras.utils.get_file('captions.zip',
#                                            cache_subdir=os.path.abspath('.'),
#                                            origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
#                                            extract=True)
#   annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'
#   os.remove(annotation_zip)

# # Download image files
# image_folder = '/train2014/'
# if not os.path.exists(os.path.abspath('.') + image_folder):
#   image_zip = tf.keras.utils.get_file('train2014.zip',
#                                       cache_subdir=os.path.abspath('.'),
#                                       origin='http://images.cocodataset.org/zips/train2014.zip',
#                                       extract=True)
#   PATH = os.path.dirname(image_zip) + image_folder
#   os.remove(image_zip)
# else:
#   PATH = os.path.abspath('.') + image_folder

In [None]:
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip

In [None]:
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip

In [None]:
!unzip Flickr8k_Dataset.zip
!unzip Flickr8k_text.zip

In [None]:
!pip install nptyping

# Extracting photo features

In [None]:
from os import listdir
from os import path
from pickle import dump
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model

import pathlib
import typing
from typing import Dict
from nptyping import NDArray, Int, Shape

Path = typing.Union[str, pathlib.Path]

# extract features from each photo in the directory
def _photo_to_feature(model, filename):
  """Load the image at ``filename`` and return what ``model`` predicts for it."""
  image = load_img(filename, target_size=(224, 224))
  image = img_to_array(image)
  # add a leading batch axis of size 1 so a single image can be fed to predict()
  image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
  # VGG16-specific input preprocessing shipped with keras.applications.vgg16
  image = preprocess_input(image)
  return model.predict(image, verbose=0)


# extract features from each photo in the directory
def extract_features(directory: Path, is_directory: bool = True)  -> Dict[str, NDArray[Shape["1, 4096"], Int]]:
  """Run photos through VGG16 (cut at its second-to-last layer) and collect the outputs.

  Args:
    directory: a folder of images when ``is_directory`` is True, otherwise the
      path of a single image file.
    is_directory: selects between the folder mode and the single-file mode.

  Returns:
    A dict of predicted feature arrays. In folder mode the key is the file name
    up to its first '.', in single-file mode the key is the file's base name
    (extension included).
  """
  model = VGG16()
  # model.layers.pop() No effect

  # re-wire the network so that its output is the second-to-last layer
  model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

  model.summary()
  features = dict()
  if not is_directory:
    # path.basename(str(...)) also works for pathlib.Path, which the Path alias allows
    image_id = path.basename(str(directory))
    features[image_id] = _photo_to_feature(model, directory)
  else:
    for name in listdir(directory):
      image_id = name.split('.')[0]
      features[image_id] = _photo_to_feature(model, path.join(directory, name))
      print('>%s' % name)
  return features


# directory = '/content/Flicker8k_Dataset'
directory = './Flicker8k_Dataset'

features = extract_features(directory)
print('Extracted features : %d' % len(features))
# A context manager guarantees the pickle is flushed and the handle closed,
# even when dump() raises part-way through.
with open('features.pkl', 'wb') as feature_file:
  dump(features, feature_file)

# Create descriptions.txt file

In [None]:
import string 
import re
from typing import TextIO, NoReturn, List, Dict

def load_doc(filename: str) -> str:
  """Return the whole content of the text file at path ``filename`` as one string.

  The file is opened in text mode with the platform default encoding and is
  closed again even if reading fails.
  """
  with open(filename, 'r') as file:
    return file.read()

def load_descriptions(doc: str) -> Dict[str, List[str]]:
  """Group the captions found in ``doc`` by image id.

  Each line is split on whitespace: the first token is the image reference
  (e.g. ``1000268201_693b08cb0e.jpg#0``) and the remaining tokens form the
  caption. The image id is the reference up to its first '.'.

  Returns:
    {image_id: [caption, ...]} with captions in the order they appear.
  """
  mapping = dict()

  for line in doc.split('\n'):
    # line -> 1000268201_693b08cb0e.jpg#0<TAB>A child in a pink dress is climbing up a set of stairs in an entry way .
    tokens = line.split()
    # Lines shorter than two characters are skipped as before; whitespace-only
    # lines yield no tokens and would otherwise raise IndexError on tokens[0].
    if len(line) < 2 or not tokens:
      continue
    # image_id -> 1000268201_693b08cb0e.jpg#0
    image_id, image_desc = tokens[0], tokens[1:]
    image_id = image_id.split('.')[0] # image_id -> 1000268201_693b08cb0e
    # {'1000268201_693b08cb0e': ['A child in a pink dress ... .', 'A girl going into a wooden building .', ...]}
    mapping.setdefault(image_id, []).append(' '.join(image_desc))

  return mapping

def clean_descriptions(descriptions : Dict) -> None:
  """Normalise every caption in ``descriptions`` in place.

  For each caption the words are lower-cased, stripped of all
  ``string.punctuation`` characters, and then dropped when they are a single
  character long (or empty) or contain anything other than letters. The
  surviving words are re-joined with single spaces and written back into the
  same list slot; nothing is returned.
  """
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  for desc_list in descriptions.values():
    # desc_list -> ['A child in a pink dress is climbing up a set of stairs in an entry way .', ...]
    for i, desc in enumerate(desc_list):
      stripped = (re_punc.sub('', word.lower()) for word in desc.split())
      desc_list[i] = ' '.join(word for word in stripped if len(word) > 1 and word.isalpha())


# Count the number of unique words present in descriptions loaded from "flickr8k.token.txt"
def to_vocabulary(descriptions: Dict[str, List[str]]) -> set:
  """Return the set of distinct whitespace-separated words over all captions."""
  all_desc = set()
  # descriptions -> {'1000268201_693b08cb0e': [caption, ...], ...}
  for desc_list in descriptions.values():
    for desc in desc_list:
      all_desc.update(desc.split())

  return all_desc

def save_descriptions(descriptions : Dict[str, List[str]], filename):
  """Write every caption to ``filename``, one ``<image_id> <caption>`` per line.

  Lines are separated by '\\n' with no trailing newline. The file is opened in
  text mode with the platform default encoding and is closed even if the
  write fails.
  """
  # lines -> ['1000268201_693b08cb0e child in pink dress is climbing up set of stairs in an entry way', '...']
  lines = [key + ' ' + desc
           for key, desc_list in descriptions.items()
           for desc in desc_list]
  with open(filename, 'w') as file:
    file.write('\n'.join(lines))

# consists of all tokens
# filename = '/content/Flickr8k.token.txt'
filename = './Flickr8k.token.txt'
# The token file holds one caption per line in the form
# "<image file>#<caption number><TAB><caption>", for example:
#   1000268201_693b08cb0e.jpg#0<TAB>A child in a pink dress is climbing up a set of stairs in an entry way .
#   1000268201_693b08cb0e.jpg#1<TAB>A girl going into a wooden building .
#   1000268201_693b08cb0e.jpg#2<TAB>A little girl climbing into a wooden playhouse .
# (kept as comments: a bare string literal here would be an evaluated no-op expression)
doc: str = load_doc(filename)
descriptions: Dict[str, List[str]] = load_descriptions(doc)
print('Loaded: %d' % len(descriptions))

# clean_descriptions() rewrites the captions inside `descriptions` in place
clean_descriptions(descriptions)
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' %len(vocabulary))

# The end result
save_descriptions(descriptions, 'descriptions.txt')

'''
descriptions.txt
1000268201_693b08cb0e child in pink dress is climbing up set of stairs in an entry way
1000268201_693b08cb0e girl going into wooden building
1000268201_693b08cb0e little girl climbing into wooden playhouse
1000268201_693b08cb0e little girl climbing the stairs to her playhouse
1000268201_693b08cb0e
'''

In [None]:
!wget https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint



# Tip: For better type hints use pycharm 
def to_lines(descriptions: Dict[str, List[str]]) -> List[str]:
  """Flatten ``descriptions`` into one new list of captions, in dict order."""
  all_desc = list()
  for desc_list in descriptions.values():
    all_desc.extend(desc_list)
  # for training dataset - length of all_desc is 30,000
  return all_desc

def create_tokenizer(descriptions: Dict[str, List[str]]):
  """Build a Keras ``Tokenizer`` and fit it on every caption in ``descriptions``.

  The captions are flattened with ``to_lines`` first, so the tokenizer is
  fitted on a plain list of strings such as
  ['a child is playing', 'a dog is running across the beach', ...].
  """
  captions = to_lines(descriptions)
  fitted = Tokenizer()
  fitted.fit_on_texts(captions)
  return fitted

def max_length(descriptions: Dict[str, List[str]]):
  """Return the number of whitespace-separated words in the longest caption."""
  word_counts = [len(caption.split()) for caption in to_lines(descriptions)]
  return max(word_counts)

# def create_sequences(tokenizer: Tokenizer, max_length: int, descriptions : Dict[str, List[str]], photos: Dict, vocab_size: int):
#   X1, X2, y = list(), list(), list()
#   for key, desc_list in descriptions.items():
#     for desc in desc_list:
#       seq: List[int] = tokenizer.texts_to_sequences([desc])[0]
#       for i in range(1, len(seq)):
#         in_seq, out_seq = seq[:i], seq[i]
#         in_seq: NDArray[Shape["34, "], Int] = pad_sequences([in_seq], maxlen=max_length)[0]
#         out_seq: NDArray[Shape["1, 7579"], Int] = to_categorical([out_seq], num_classes=vocab_size)[0]
#         X1.append(photos[key][0])
#         X2.append(in_seq)
#         y.append(out_seq)

#   return array(X1), array(X2), array(y)

def create_sequences(tokenizer: Tokenizer, max_length: int, desc_list: List[str], photo: Dict[str, NDArray[Shape["1, 4096"], Int]], vocab_size: int):
  """Expand the captions of one image into (photo, caption prefix) -> next-word samples.

  Every caption is encoded with ``tokenizer``; for each position i >= 1 the
  first i word ids (padded to ``max_length``) become one text input and word
  id i, one-hot encoded over ``vocab_size`` classes, becomes its target.
  ``photo`` is repeated once per sample.

  NOTE(review): ``photo`` is annotated as a dict, but ``data_generator`` in
  this file passes ``photos[key][0]`` (a single feature array) - confirm the
  intended annotation.

  Returns:
    Three arrays: photo inputs, padded prefix inputs, one-hot next words.
  """
  image_inputs, text_inputs, next_words = [], [], []
  for caption in desc_list:
    encoded = tokenizer.texts_to_sequences([caption])[0]
    # one training sample per split point of the encoded caption
    for split_at in range(1, len(encoded)):
      prefix = pad_sequences([encoded[:split_at]], maxlen=max_length)[0]
      target = to_categorical([encoded[split_at]], num_classes=vocab_size)[0]
      image_inputs.append(photo)
      text_inputs.append(prefix)
      next_words.append(target)
  return array(image_inputs), array(text_inputs), array(next_words)

# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
  """Endlessly yield one training batch per image.

  Each batch is ``([photo_inputs, prefix_inputs], next_words)`` as produced by
  ``create_sequences`` for all captions of a single image. After the last
  image the iteration starts over from the first, so the generator never
  finishes on its own.
  """
  while True:
    for image_id in descriptions:
      captions = descriptions[image_id]
      # photos[image_id] is indexed with [0] to take its first row as the photo input
      feature = photos[image_id][0]
      samples = create_sequences(tokenizer, max_length, captions, feature, vocab_size)
      image_inputs, text_inputs, next_words = samples
      yield [image_inputs, text_inputs], next_words

# Prepare the Training Dataset

In [None]:
from pickle import load
from typing import Set
from typing import TextIO, List, Set, Dict
from nptyping import NDArray, Int, Shape


def load_set(filename : TextIO) -> Set[str]:
  """Read a split file (one image file name per line) into a set of image ids.

  Empty lines are ignored; the id is the line up to its first '.', e.g.
  '2513260012_03d33305cf.jpg' -> '2513260012_03d33305cf'.
  """
  lines = load_doc(filename).split('\n')
  return {line.split('.')[0] for line in lines if len(line) >= 1}


def load_clean_descriptions(filename: str, dataset: Set[str]) -> Dict[str, List[str]]:
  """Load the cleaned captions of the images in ``dataset`` and wrap them in markers.

  Args:
    filename: path of a file with one ``<image_id> <caption words...>`` per line.
    dataset: image ids to keep; only membership tests are performed on it.

  Returns:
    {image_id: ['startseq <caption> endseq', ...]} for the ids found in ``dataset``.
  """
  descriptions = dict()
  for line in load_doc(filename).split('\n'):
    # line -> 1000268201_693b08cb0e child in pink dress is climbing up set of stairs in an entry way
    tokens = line.split()
    if not tokens:
      # blank lines (e.g. a trailing newline) have no image id; tokens[0] would raise IndexError
      continue
    image_id, image_desc = tokens[0], tokens[1:]
    # image_id -> 1000268201_693b08cb0e
    if image_id in dataset:
      # desc -> 'startseq child in pink dress is climbing up set of stairs in an entry way endseq'
      desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
      # descriptions -> {'1000268201_693b08cb0e': [captions of that image]}
      descriptions.setdefault(image_id, []).append(desc)
  return descriptions


def load_photo_features(filename, dataset: Set[str]) -> Dict[str, NDArray[Shape["1, 4096"], Int]]:
  """Load the pickled feature dict at ``filename`` and keep only the ids in ``dataset``.

  Raises:
    KeyError: if an id in ``dataset`` has no entry in the pickled dict.
  """
  # dataset = {'2513260012_03d33305cf', '2903617548_d3e38d7f88', ...}
  # 'rb': pickle data is binary. The context manager closes the handle after loading.
  # NOTE: unpickling executes code embedded in the file - only load pickles you created yourself.
  with open(filename, 'rb') as feature_file:
    all_features = load(feature_file)
  features = {k: all_features[k] for k in dataset}
  return features

'''
Flickr_8k.trainImages.txt/ train

2513260012_03d33305cf.jpg
2903617548_d3e38d7f88.jpg
3338291921_fe7ae0c8f8.jpg
488416045_1c6d903fe0.jpg
2644326817_8f45080b87.jpg
'''
# filename = '/content/Flickr_8k.trainImages.txt'
# Split file listing the training images, one image file name per line.
filename = './Flickr_8k.trainImages.txt'

# Image ids of the training split (file names cut at their first '.').
train = load_set(filename)
# train - {"1000268201_693b08cb0e", "..."}

print('Length of the Training Dataset: %d' %len(train)) # 6000 id's and each id with 5 descriptions => 30,000 {id, description} pairs

# Captions are read from descriptions.txt (written by the descriptions cell) and
# wrapped in 'startseq' / 'endseq' by load_clean_descriptions.
# train_descriptions - {"1000268201_693b08cb0e": ["startseq child in pink dress is climbing up set of stairs in an entry way endseq", "...."]}
train_descriptions: Dict[str, List[str]] = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' %len(train_descriptions)) # the number of keys present in train_descriptions dataset

# Photo features of the training images, taken from the pickle written by the
# feature-extraction cell.
train_features = load_photo_features('./features.pkl', train)
print('Photos: train=%d' %len(train_features))

# Training the Model

In [None]:
# train_descriptions - {"1000268201_693b08cb0e": ["child in pink dress is climbing up set of stairs in an entry way", "...."]}
tokenizer = create_tokenizer(train_descriptions)
# +1 so that the largest word index is still a valid row/class index
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' %vocab_size)

# Number of words in the longest training caption.
# Computed inline on purpose: later cells read the integer under the name
# `max_length`, which is also the name of the helper function. Calling the
# helper here (`max_length = max_length(...)`) replaces the function with an int,
# so a second run of this cell fails with "'int' object is not callable".
max_length = max(len(caption.split()) for caption in to_lines(train_descriptions))
print('Description Length: %d' %max_length)

# X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features)

# Development split. It is stored under the `test*` names, which the
# evaluation cell later rebinds to the real test split.
# filename = '/content/Flickr_8k.devImages.txt'
filename = './Flickr_8k.devImages.txt'

test = load_set(filename)
print('Dataset: %d'%len(test))

test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test = %d' %len(test_descriptions))

test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' %len(test_features))

In [None]:
from nptyping.typing_ import Float

# Maps each GloVe word to its coefficient vector; filled by the loop further
# down in this cell, which reads the file opened here line by line.
# NOTE(review): the annotation says Shape["200, 1"], but np.asarray() over the
# split values presumably yields a flat 200-element vector - confirm.
embeddings_index: Dict[str, NDArray[Shape["200, 1"], Float]] = {} 
f = open('./glove.6B.200d.txt', encoding="utf-8")

'''
glove.6B.200d.txt
the -0.071549 0.093459 0.023738 -0.090339 0.056123 0.32547 -0.39796 -0.092139 0.061181 -0.1895 0.13061 0.14349 0.011479 0.38158 0.5403 -0.14088 0.24315 0.23036 -0.55339 0.048154 0.45662 3.2338 0.020199 0.049019 -0.014132 0.076017 -0.11527 0.2006 -0.077657 0.24328 0.16368 -0.34118 -0.06607 0.10152 0.038232 -0.17668 -0.88153 -0.33895 -0.035481 -0.55095 -0.016899 -0.43982 0.039004 0.40447 -0.2588 0.64594 0.26641 0.28009 -0.024625 0.63302 -0.317 0.10271 0.30886 0.097792 -0.38227 0.086552 0.047075 0.23511 -0.32127 -0.28538 0.1667 -0.0049707 -0.62714 -0.24904 0.29713 0.14379 -0.12325 -0.058178 -0.001029 -0.082126 0.36935 -0.00058442 0.34286 0.28426 -0.068599 0.65747 -0.029087 0.16184 0.073672 -0.30343 0.095733 -0.5286 -0.22898 0.064079 0.015218 0.34921 -0.4396 -0.43983 0.77515 -0.87767 -0.087504 0.39598 0.62362 -0.26211 -0.30539 -0.022964 0.30567 0.06766 0.15383 -0.11211 -0.09154 0.082562 0.16897 -0.032952 -0.28775 -0.2232 -0.090426 1.2407 -0.18244 -0.0075219 -0.041388 -0.011083 0.078186 0.38511 0.23334 0.14414 -0.0009107 -0.26388 -0.20481 0.10099 0.14076 0.28834 -0.045429 0.37247 0.13645 -0.67457 0.22786 0.12599 0.029091 0.030428 -0.13028 0.19408 0.49014 -0.39121 -0.075952 0.074731 0.18902 -0.16922 -0.26019 -0.039771 -0.24153 0.10875 0.30434 0.036009 1.4264 0.12759 -0.073811 -0.20418 0.0080016 0.15381 0.20223 0.28274 0.096206 -0.33634 0.50983 0.32625 -0.26535 0.374 -0.30388 -0.40033 -0.04291 -0.067897 -0.29332 0.10978 -0.045365 0.23222 -0.31134 -0.28983 -0.66687 0.53097 0.19461 0.3667 0.26185 -0.65187 0.10266 0.11363 -0.12953 -0.68246 -0.18751 0.1476 1.0765 -0.22908 -0.0093435 -0.20651 -0.35225 -0.2672 -0.0034307 0.25906 0.21759 0.66158 0.1218 0.19957 -0.20303 0.34474 -0.24328 0.13139 -0.0088767 0.33617 0.030591 0.25577, 0.17651 0.29208 -0.0020768 -0.37523 0.0049139 0.23979 -0.28893 -0.014643 -0.10993 0.15592 0.20627 0.47675 0.099907 -0.14058 0.21114 0.12126 -0.31831 -0.089433 -0.090553 -0.31962 0.21319 2.4844 -0.077521 -0.084279 0.20186 0.26084 -0.40411 -0.19127 
0.24715 0.22394 -0.063437 0.20379 -0.18463 -0.088413 0.024169 -0.28769 -0.61246 -0.12683 -0.088273 0.18331 -0.53161 -0.1997 -0.26703 0.15312 -0.015239 -0.082844 0.47856 -0.29612 0.11168 -0.02579 -0.011697 0.19923 -0.14267 0.6625 -0.051739 -0.16938 -0.15635 0.092806 0.32548 0.11724 0.28788 -0.060651 -0.14153 0.16668 0.26861 -0.031001 -0.39665 0.35304 0.2385 0.12388 0.45698 -0.12559 -0.12804 0.37449 0.2446 0.23073 0.20808 0.051258 -0.21816 -0.036409 -0.0388 -0.042487 -0.30779 -0.025449 0.22532 0.045538 -0.48934 -0.13988 0.17394 -0.46137 -0.26555 0.15473 0.063816 -0.17022 -0.15762 0.075765 0.12151 -0.4934 -0.10909 0.034487 0.29947 0.01869 -0.16534 0.016679 0.16341 -0.27418 0.077797 1.4023 0.025275 0.094725 -0.040735 -0.10642 0.023364 0.079143 -0.16615 -0.23013 -0.14071 0.40159 -0.34951 0.018545 0.22434 0.76922 0.24722 0.14936 0.42368 -0.72059 -0.038541 0.15522 0.33596 -0.43077 -0.026925 -0.37733 0.24271 -0.46495 0.45783 0.23693 0.079361 -0.32244 -0.42434 -0.11138 0.55426 0.085153 -0.020581 -0.046386 1.2467 0.13177 0.067092 -0.5778 0.013586 -0.071274 0.017311 0.089781 0.19857 -0.032205 0.64843 -0.23797 -0.19676 0.20203 0.21074 -0.50347 0.026823 -0.045444 -0.22642 -0.19977 -0.12138 0.16941 0.061998 0.42631 -0.088383 0.45756 0.077774 0.061342 0.4571 -0.17787 -0.14597 0.32654 0.002443 -0.11886 0.10081 -0.020011 1.0366 -0.39814 -0.6818 0.23685 -0.20396 -0.17668 -0.31385 0.14834 -0.052187 0.0613 -0.32582 0.19153 -0.15469 -0.14679 0.046971 0.032325 -0.22006 -0.20774 -0.23189 -0.10814. 
0.12289 0.58037 -0.069635 -0.50288 0.10503 0.39945 -0.38635 -0.084279 0.12219 0.080312 0.32337 0.47579 -0.038375 -0.00709 0.41524 0.32121 -0.21185 0.36144 -0.055623 -0.030512 0.42854 2.8547 -0.14623 -0.17557 0.31197 -0.13118 0.033298 0.13093 0.089889 -0.12417 0.0023396 -0.068954 -0.10754 -0.11551 -0.31052 -0.12097 -0.46691 -0.0836 -0.037664 -0.071779 -0.11899 -0.20381 -0.12424 0.46339 -0.19828 -0.0080365 0.53718 0.031739 0.34331 0.0079704 0.0048744 0.030592 -0.17615 0.82342 -0.13793 -0.10075 -0.12686 0.074735 -0.088719 -0.042719 0.076624 0.089263 0.064445 -0.031958 0.15254 -0.10384 0.076604 0.34099 0.24331 -0.10452 0.40714 -0.1826 -0.040667 0.50878 0.08076 0.22759 -0.042162 -0.18171 -0.095025 0.030334 0.088202 -3.9843e-06 -0.0039877 0.15724 0.33167 0.08471 -0.25919 -0.41384 0.2992 -0.54255 0.032129 0.1003 0.44202 0.044682 -0.090681 -0.10481 -0.1186 -0.31972 -0.2079 -0.040203 -0.022988 0.22824 0.0055238 0.12568 -0.1464 -0.14904 -0.11561 1.0517 -0.19498 0.083958 0.044812 -0.12965 -0.093468 0.21237 -0.088332 -0.1868 0.26521 0.13097 -0.048102 -0.22467 0.28412 0.34907 0.34833 0.017877 0.30504 -0.83453 0.048856 -0.1933 0.20764 -0.49701 -0.18747 -0.076801 0.15558 -0.46844 0.40944 0.21386 0.082392 -0.26491 -0.21224 -0.13293 0.14738 -0.14192 0.18994 -0.15587 1.0738 0.40789 -0.27452 -0.18431 0.00068679 -0.087115 0.19672 0.40918 -0.35462 -0.06326 0.4492 -0.060568 -0.041636 0.20531 0.017025 -0.58448 0.075441 0.082116 -0.46008 0.012393 -0.02531 0.14177 -0.092192 0.34505 -0.52136 0.57304 0.011973 0.033196 0.29672 -0.27899 0.19979 0.25666 0.082079 -0.078436 0.093719 0.24202 1.3495 -0.30434 -0.30936 0.42047 -0.079068 -0.14819 -0.089404 0.0668 0.22405 0.27226 -0.035236 0.17688 -0.0536 0.0070031 -0.033006 -0.080021 -0.24451 -0.039174 -0.16236 -0.096652
of 0.052924 0.25427 0.31353 -0.35613 0.029629 0.51034 -0.10716 0.15195 0.057698 0.06149 0.06116 0.39911 -0.00029018 0.31978 0.43257 -0.14708 0.054842 0.27079 -0.14051 -0.30101 0.16313 3.0013 0.22231 -0.14279 0.083705 0.089866 -0.52706 -0.089661 0.27311 0.31413 -0.04081 0.060557 -0.042656 0.24178 -0.29187 0.22575 -0.6298 -0.14641 -0.22429 -0.056621 -0.17776 -0.64269 0.51626 0.22305 0.12124 0.48074 0.41743 0.54805 0.40955 0.42407 0.049906 -0.32574 0.46298 0.19245 0.28143 0.2966 0.063593 -0.11906 -0.15016 -0.04984 0.40675 0.010675 -0.69127 0.048729 0.26391 0.30961 -0.11921 0.25548 -0.28219 -0.037413 0.36461 0.027129 0.20786 0.53325 0.50148 0.72381 0.065292 -0.078716 -0.10537 -0.08081 -0.2096 0.040902 -0.88101 0.24715 0.16146 0.10361 0.19705 -0.27365 0.89902 -0.29981 0.036165 0.041238 0.60105 -0.18911 -0.43887 -0.14097 0.44073 -0.19999 0.28834 -0.25458 -0.10985 -0.0027379 0.091735 0.17021 -0.16305 -0.57439 0.37063 1.7262 -0.24656 0.51681 -0.15355 -0.15553 0.019783 0.1803 0.38178 0.094443 -0.55158 -0.20242 -0.4386 -0.42108 0.27525 0.58977 0.026655 0.16401 0.13893 -0.68692 0.51071 0.29278 0.022041 -0.18156 -0.64905 0.16923 -0.01059 0.21785 -0.27242 0.27967 0.1395 -0.70559 -0.26034 -0.44017 0.15303 0.19693 -0.096838 0.14827 1.1294 -0.31267 0.0099916 -0.48623 0.080584 0.35608 -0.19925 0.19306 -0.2004 -0.44194 0.75766 0.24487 -0.18903 0.26653 -0.21339 -0.54083 0.40532 -0.02796 -0.13398 -0.11086 0.059506 0.24052 -0.59739 -0.0024069 -0.18593 1.042 -0.12969 0.20813 0.33305 -0.1278 0.085662 -0.076422 0.31407 -0.23784 -0.054838 0.011369 0.845 -0.34165 0.093983 0.082445 -0.27777 -0.44226 -0.063078 0.37274 0.054468 0.24197 -0.040886 0.3894 -0.10509 0.23372 0.096027 -0.30324 0.24488 -0.086254 -0.41917 0.46496
to 0.57346 0.5417 -0.23477 -0.3624 0.4037 0.11386 -0.44933 -0.30991 -0.0053411 0.58426 -0.025956 0.49393 -0.037209 -0.28428 0.097696 -0.48907 0.026027 0.37649 0.057788 -0.46807 0.081288 3.2825 -0.6369 0.37956 0.0038167 0.093607 -0.12855 0.1738 0.10522 0.28648 0.21089 -0.47076 0.027733 -0.19803 0.076328 -0.84629 -0.79708 -0.38743 -0.030422 -0.26849 0.48585 0.12895 0.38354 0.38722 -0.38524 0.19075 0.48998 0.13278 0.010792 0.2677 0.17812 -0.11433 -0.33494 0.87306 0.75875 -0.30378 -0.15626 0.0012085 0.23322 0.27953 -0.18494 -0.14146 -0.18969 -0.038386 0.35874 0.065513 0.060565 0.66339 -0.083252 0.065163 0.51761 0.16171 0.46011 0.16388 -0.12399 0.31122 -0.15412 -0.10917 -0.42551 0.11418 0.25137 -0.056158 -0.25927 0.28163 -0.018094 0.16065 -0.48506 -0.98903 0.25022 -0.16736 0.41474 0.17701 0.42407 0.11088 -0.1836 -0.1241 -0.3478 0.099078 -0.22381 -0.11245 -0.21156 0.0030706 -0.23607 0.027261 0.3643 0.039922 -0.18369 1.2266 -0.7764 -0.66225 0.015724 -0.14969 0.084649 0.26814 -0.16765 -0.31942 0.28494 -0.07 0.01201 -0.12219 0.5631 -0.32 0.50109 -0.10209 0.46575 -0.71542 0.17293 0.58259 0.078384 -0.033844 -0.25129 0.36503 0.031578 -0.65778 0.05475 0.87189 0.12455 -0.45877 -0.26965 -0.46779 -0.0028578 0.1781 0.63969 0.13995 0.97596 0.11836 -0.63904 -0.15416 0.065262 0.24329 0.66476 0.25069 -0.10252 -0.32839 -0.085559 -0.012774 -0.19431 0.56139 -0.35733 -0.20344 -0.12413 -0.34431 -0.23296 -0.21187 0.085387 0.070063 -0.19803 -0.026023 -0.39037 0.80002 0.40577 -0.079863 0.35263 -0.34043 0.39676 0.22862 -0.35028 -0.47344 0.59742 -0.11657 1.0552 -0.4157 -0.080552 -0.056571 -0.16622 0.19274 -0.095175 -0.20781 0.1562 0.050231 -0.27915 0.43742 -0.31237 0.13194 -0.33278 0.18877 -0.23422 0.54418 -0.23069 0.34947
and
'''
# Each GloVe line is "<word> <coef> <coef> ...": the first token is the word,
# the rest are parsed as float32 coefficients.
try:
    for line in f:
        values = line.split()
        if not values:
            # blank line: nothing to index
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
finally:
    # release the GloVe file handle opened above, even if parsing fails
    f.close()

embedding_dim = 200

# embedding_matrix will be the weight matrix that is provided for embedding layer in our model
# Row i holds the GloVe vector of the word with tokenizer index i; words
# without a GloVe entry (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector



In [None]:
# Sanity check: the matrix was allocated as (vocab_size, embedding_dim).
print(embedding_matrix.shape)

In [None]:
def define_model(vocab_size, max_length):
  """Build and compile the two-input captioning model.

  Args:
    vocab_size: size of the Embedding input vocabulary and of the softmax output.
    max_length: length of the (padded) word-id sequence fed to the text branch.

  Returns:
    The compiled Keras ``Model`` taking ``[photo_features, word_ids]``.

  Side effects: prints the model summary and writes the architecture diagram
  to 'model.png'. Reads the module-level ``embedding_dim`` for the Embedding
  output size.
  """
  # photo branch: 4096-element feature vector -> dropout -> 256-unit dense layer
  inputs1 = Input(shape=(4096,))
  fe1 = Dropout(0.5)(inputs1)
  fe2 = Dense(256, activation='relu')(fe1)

  # text branch: word ids -> embedding (mask_zero=True) -> dropout -> 256-unit LSTM
  inputs2 = Input(shape=(max_length, ))
  se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
  se2 = Dropout(0.5)(se1)
  se3 = LSTM(256)(se2)

  # decoder: element-wise sum of both 256-wide branches -> dense -> softmax over the vocabulary
  decoder1 = add([fe2, se3])
  decoder2 = Dense(256, activation='relu')(decoder1)
  outputs = Dense(vocab_size, activation='softmax')(decoder2)

  model = Model(inputs=[inputs1, inputs2], outputs=outputs)
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

# X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features)
model = define_model(vocab_size, max_length)

# Load the GloVe matrix into the Embedding layer and freeze it. The layer is
# looked up by type instead of by position in model.layers, so the code keeps
# working if the layer order changes.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
# define_model() has already compiled the model. Keras asks for compile() to be
# called again after `trainable` changes so that the freeze is taken into
# account; nothing has been trained yet, so no optimizer state is lost.
model.compile(loss='categorical_crossentropy', optimizer='adam')

# checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# model.fit([X1train, X2train], ytrain, epochs=20, verbose=2, callback=[checkpoint], validation_data=([X1test, X2test], ytest))
# train the model, run epochs manually and save after each epoch
epochs = 15
# data_generator yields one batch per image, so one pass over the data is len(train_descriptions) steps
steps = len(train_descriptions)

for i in range(epochs):
  # create the data generator
  generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
  # fit for one epoch
  # NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases
  # in favour of fit(); confirm against the installed version.
  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
  # save model
  model.save('model_' + str(i) + '.h5')

In [None]:
from numpy import argmax
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

def word_for_id(integer, tokenizer):
  """Return the word whose tokenizer index equals ``integer``, or None if there is none.

  Uses the tokenizer's reverse map ``index_word`` when it is available (one
  dict lookup instead of a scan over the whole vocabulary for every generated
  word) and falls back to scanning ``word_index`` otherwise.
  """
  index_word = getattr(tokenizer, 'index_word', None)
  if index_word is not None:
    return index_word.get(integer)
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
  return None

def generate_desc(model, tokenizer, photo, max_length):
  """Generate a caption for ``photo`` one word at a time (greedy arg-max decoding).

  Starting from 'startseq', the text so far is encoded and padded to
  ``max_length``, the model predicts the next word, and the most probable
  word is appended. Generation stops after ``max_length`` words, when the
  predicted index has no word, or once 'endseq' has been appended.

  Returns:
    The generated text including the 'startseq' (and, if reached, 'endseq') markers.
  """
  words = ['startseq']
  for _ in range(max_length):
    encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
    padded = pad_sequences([encoded], maxlen=max_length)
    next_id = argmax(model.predict([photo, padded], verbose=0))
    next_word = word_for_id(next_id, tokenizer)
    if next_word is None:
      break
    words.append(next_word)
    if next_word == 'endseq':
      break
  return ' '.join(words)


def cleanup_summary(summary):
  """Strip the 'startseq' / 'endseq' markers from a caption.

  Everything up to and including the first 'startseq' word is dropped, then
  everything from the first remaining 'endseq' word on. Works on whole words,
  so captions consisting only of markers ('startseq', 'startseq endseq')
  become '' instead of keeping a marker, and the slice follows the position
  where the marker was actually found. The result is joined with single spaces.
  """
  tokens = summary.split()
  if 'startseq' in tokens:
    tokens = tokens[tokens.index('startseq') + 1:]
  if 'endseq' in tokens:
    tokens = tokens[:tokens.index('endseq')]
  return ' '.join(tokens)

def evaluate_model(model, descriptions, photos, tokenizer, max_length):
  """Caption every image in ``descriptions`` and print corpus BLEU-1 to BLEU-4.

  Args:
    model: trained captioning model passed on to ``generate_desc``.
    descriptions: {image_id: [reference caption, ...]} (markers are stripped here).
    photos: {image_id: photo feature} looked up with the same ids.
    tokenizer: tokenizer passed on to ``generate_desc``.
    max_length: maximum caption length passed on to ``generate_desc``.
  """
  actual, predicted = list(), list()
  for key, desc_list in descriptions.items():
    yhat = generate_desc(model, tokenizer, photos[key], max_length)
    yhat = cleanup_summary(yhat)
    references = [cleanup_summary(d).split() for d in desc_list]
    actual.append(references)
    predicted.append(yhat.split())

  print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  # uniform weights over 1- to 3-grams; (0.3, 0.3, 0.3) only sums to 0.9
  print("BLEU-3: %f" % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
  print("BLEU-4: %f" % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


# filename = '/content/Flickr_8k.testImages.txt'
# Split file listing the held-out test images, one image file name per line.
filename = './Flickr_8k.testImages.txt'
# From here on `test`, `test_descriptions` and `test_features` refer to the
# test split (they were bound to the dev split in the training-preparation cell).
test = load_set(filename)
print('Length of testing dataset: %d'% len(test))

test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test= %d' %len(test_descriptions))

test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' %len(test_features))

# Checkpoint written by the training loop as 'model_<i>.h5'; i = 6 is hard-coded here.
filename = './model_6.h5'
model = load_model(filename)
# Relies on `tokenizer` and the integer `max_length` from the training-preparation cell.
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

In [None]:
import requests

# Download a sample picture to caption. The timeout keeps the cell from hanging
# on a stalled connection.
response = requests.get("https://previews.123rf.com/images/lotosfoto/lotosfoto1711/lotosfoto171100342/89959642-little-girl-swinging-on-a-swing.jpg", timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as a .jpg
response.raise_for_status()

with open("sample_image.jpg", "wb") as file:
  file.write(response.content)

def _photo_to_feature(model, filename):
  """Load the image at ``filename`` and return what ``model`` predicts for it."""
  image = load_img(filename, target_size=(224, 224))
  image = img_to_array(image)
  # add a leading batch axis of size 1 so a single image can be fed to predict()
  image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
  # VGG16-specific input preprocessing shipped with keras.applications.vgg16
  image = preprocess_input(image)
  return model.predict(image, verbose=0)


# NOTE: this re-defines the extract_features from the feature-extraction cell
# with the same behaviour, so this cell can be run on its own once the imports
# of that cell are available.
def extract_features(directory: Path, is_directory: bool = True)  -> Dict[str, NDArray[Shape["1, 4096"], Int]]:
  """Run photos through VGG16 (cut at its second-to-last layer) and collect the outputs.

  Args:
    directory: a folder of images when ``is_directory`` is True, otherwise the
      path of a single image file.
    is_directory: selects between the folder mode and the single-file mode.

  Returns:
    A dict of predicted feature arrays. In folder mode the key is the file name
    up to its first '.', in single-file mode the key is the file's base name
    (extension included).
  """
  model = VGG16()
  # model.layers.pop() No effect

  # re-wire the network so that its output is the second-to-last layer
  model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

  model.summary()
  features = dict()
  if not is_directory:
    # path.basename(str(...)) also works for pathlib.Path, which the Path alias allows
    image_id = path.basename(str(directory))
    features[image_id] = _photo_to_feature(model, directory)
  else:
    for name in listdir(directory):
      image_id = name.split('.')[0]
      features[image_id] = _photo_to_feature(model, path.join(directory, name))
      print('>%s' % name)
  return features

# extract_features() returns a dict {image name: feature array}; generate_desc()
# feeds `photo` straight into model.predict(), so it needs the array itself.
# The dict holds exactly one entry in single-file mode.
sample_features = extract_features("sample_image.jpg", False)
photo = next(iter(sample_features.values()))
description = generate_desc(model, tokenizer, photo, max_length)
description = cleanup_summary(description)

print(description)

In [None]:
# Ref: https://www.kaggle.com/code/basel99/image-caption-generator/notebook