In [1]:
# ! gdown "https://drive.google.com/uc?id=1q6PVD25pT06-Dj-txWX3sBRrLP1bUniK&export=download"
# ! gdown "https://drive.google.com/uc?id=16VDuLqCsig7eoBRij0lEpW4F8aPzAmr6&export=download"

In [1]:
import tensorflow as tf

from model import CaptionGenerator
#from dataset import prepare_train_data, prepare_eval_data, prepare_test_data

In [2]:
import pandas as pd
import numpy as np
import glob

In [3]:
!ls

Developement.ipynb  config.py	model.py	    test       vgg16_no_fc.npy
LICENSE.md	    dataset.py	models		    train      vocabulary.csv
README.md	    eval.sh	new_data	    train_new
__pycache__	    examples	resnet50_no_fc.npy  utils
base_model.py	    main.py	summary		    val


In [63]:
# Load the product catalogue, drop rows with any missing value, and expose
# the id / description columns under the names used by the rest of the notebook.
df = (
    pd.read_csv('train_new/datasetpip.csv')
    .dropna(axis=0)
    .assign(
        image_id=lambda frame: frame['Product ID'].astype(np.int32),
        caption=lambda frame: frame['Description'],
    )
)
productid_list = df['image_id'].unique()
df = df[['image_id', 'caption']]

In [64]:
df['image_file'] =  None

In [65]:
df['image_file'] = './train_new/images/' + df['image_id'].astype(str) + '.jpg'

In [66]:
df.head(2)

Unnamed: 0,image_id,caption,image_file
0,2951351,The first thing people will look at when they ...,./train_new/images/2951351.jpg
1,3428371,Travel back in time with our new Official Esta...,./train_new/images/3428371.jpg


In [67]:
def check_files(df):
    """Tell whether the image referenced by one row can be found on disk.

    `df` is a single row (the function is applied with `axis=1`); its
    'image_file' entry is passed to `glob.glob` as the pattern.

    Returns True when the pattern matches at least one path, else False.
    """
    matches = glob.glob(df['image_file'])
    return len(matches) != 0

In [68]:
df['status'] = df.apply(check_files,axis=1)
df['status'].value_counts()

True    98541
Name: status, dtype: int64

In [69]:
df.drop(['status'],axis=1,inplace=True)

In [70]:
df.head(1)

Unnamed: 0,image_id,caption,image_file
0,2951351,The first thing people will look at when they ...,./train_new/images/2951351.jpg


In [71]:
df['caption'].iloc[0]

"The first thing people will look at when they see you isn't you. It will be you wearing this Notre Dame Fighting Irish NCAA Basic 59FIFTY GCP fitted hat from New Era. That's because they see a person who's loud and proud about the Notre Dame Fighting Irish from a mile away. As a true fan, that's something you want your team, your school and the world to know."

In [72]:
#df['caption'].str.split('.').apply(len)

In [129]:
df[df['caption'].str.split(' ').apply(len) > 100].shape

(1003, 3)

In [73]:
df.shape

(98541, 3)

In [74]:
df_json = {'annotations':[]}

In [75]:
# Turn every dataframe row into an annotation record; image ids are converted
# to plain Python ints.
df_json_list = [
    {'image_id': int(image_id), 'image_file': image_file, 'caption': caption}
    for image_id, image_file, caption in zip(df['image_id'], df['image_file'], df['caption'])
]

In [76]:
df_json['annotations'] = df_json_list

In [77]:
#df_json['annotations']

In [78]:
#df.to_csv('./train_new/anns.csv',index=False)

## Config File

In [12]:
class Config(object):
    """ Plain container for the (hyper)parameters used by this notebook. """
    def __init__(self):
        # Every option is stored as an ordinary instance attribute.
        options = vars(self)

        # model architecture
        options.update(
            cnn='vgg16',                 # 'vgg16' or 'resnet50'
            max_caption_length=20,
            dim_embedding=512,
            num_lstm_units=512,
            num_initalize_layers=2,      # 1 or 2
            dim_initalize_layer=512,
            num_attend_layers=2,         # 1 or 2
            dim_attend_layer=512,
            num_decode_layers=2,         # 1 or 2
            dim_decode_layer=1024,
        )

        # weight initialization and regularization
        options.update(
            fc_kernel_initializer_scale=0.08,
            fc_kernel_regularizer_scale=1e-4,
            fc_activity_regularizer_scale=0.0,
            conv_kernel_regularizer_scale=1e-4,
            conv_activity_regularizer_scale=0.0,
            fc_drop_rate=0.5,
            lstm_drop_rate=0.3,
            attention_loss_factor=0.01,
        )

        # optimization
        options.update(
            num_epochs=100,
            batch_size=32,
            optimizer='Adam',            # 'Adam', 'RMSProp', 'Momentum' or 'SGD'
            initial_learning_rate=0.0001,
            learning_rate_decay_factor=1.0,
            num_steps_per_decay=100000,
            clip_gradients=5.0,
            momentum=0.0,
            use_nesterov=True,
            decay=0.9,
            centered=True,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-6,
        )

        # saver
        options.update(
            save_period=1000,
            save_dir='./models/',
            summary_dir='./summary/',
        )

        # vocabulary
        options.update(
            vocabulary_file='./vocabulary.csv',
            vocabulary_size=5000,
        )

        # training
        options.update(
            train_image_dir='./train_new/images/',
            train_caption_file='./train/captions_train2014.json',
            temp_annotation_file='./train_new/anns.csv',
            temp_data_file='./train/data.npy',
        )

        # evaluation
        options.update(
            eval_image_dir='./val/images/',
            eval_caption_file='./val/captions_val2014.json',
            eval_result_dir='./val/results/',
            eval_result_file='./val/results.json',
            save_eval_result_as_image=False,
        )

        # testing
        options.update(
            test_image_dir='./test/images/',
            test_result_dir='./test/results/',
            test_result_file='./test/results.csv',
        )


### Initialize the config

In [13]:
config = Config()
config.phase = 'train'
config.train_cnn = True
config.beam_size = 3

## Create the dataset

In [14]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.coco.coco import COCO
from utils.vocabulary import Vocabulary

class DataSet(object):
    """ Batch iterator over image files and, in training mode, their word indices and masks. """
    def __init__(self,
                 image_ids,
                 image_files,
                 batch_size,
                 word_idxs=None,
                 masks=None,
                 is_train=False,
                 shuffle=False):
        """ Keep the inputs as numpy arrays and prepare the first pass. """
        self.batch_size = batch_size
        self.is_train = is_train
        self.shuffle = shuffle
        self.image_ids = np.array(image_ids)
        self.image_files = np.array(image_files)
        self.word_idxs = np.array(word_idxs)
        self.masks = np.array(masks)
        self.setup()

    def setup(self):
        """ Compute the sample count, batch count and padding size, then reset. """
        total = len(self.image_ids)
        self.count = total
        self.num_batches = int(np.ceil(float(total) / self.batch_size))
        # Number of extra samples needed to fill the final batch.
        self.fake_count = self.num_batches * self.batch_size - total
        self.idxs = list(range(total))
        self.reset()

    def reset(self):
        """ Rewind to the first batch, reshuffling the order when requested. """
        self.current_idx = 0
        if not self.shuffle:
            return
        np.random.shuffle(self.idxs)

    def next_batch(self):
        """ Return the next batch.

        Training mode yields (image_files, word_idxs, masks); otherwise only
        image_files. A short final batch is padded with `fake_count` randomly
        drawn sample indices.
        """
        assert self.has_next_batch()

        begin = self.current_idx
        if self.has_full_next_batch():
            batch_idxs = self.idxs[begin:begin + self.batch_size]
        else:
            padding = list(np.random.choice(self.count, self.fake_count))
            batch_idxs = self.idxs[begin:self.count] + padding

        batch_files = self.image_files[batch_idxs]
        if not self.is_train:
            self.current_idx += self.batch_size
            return batch_files

        batch_words = self.word_idxs[batch_idxs]
        batch_masks = self.masks[batch_idxs]
        self.current_idx += self.batch_size
        return batch_files, batch_words, batch_masks

    def has_next_batch(self):
        """ True while at least one sample has not been served yet. """
        return self.count > self.current_idx

    def has_full_next_batch(self):
        """ True while a complete batch of unserved samples remains. """
        return self.count - self.current_idx >= self.batch_size


In [15]:
config.train_caption_file

'./train/captions_train2014.json'

In [15]:
def prepare_train_data(config):
    """ Prepare the data for training the model.

    Steps:
      1. Load the captions from `config.train_caption_file`, drop those longer
         than `config.max_caption_length`, and build the vocabulary (or load
         it from `config.vocabulary_file` when that file exists).
      2. Build the annotation table (image id / image file / caption), or read
         it back from `config.temp_annotation_file` when that file exists.
      3. Encode every caption as `config.max_caption_length` word indices plus
         a 0/1 mask, or read both back from `config.temp_data_file` when that
         file exists.

    Args:
        config: object providing the attributes read below (caption file,
            vocabulary file/size, image dir, cache paths, batch size,
            max caption length).

    Returns:
        A `DataSet` built in training mode with shuffling enabled.
    """
    coco = COCO(config.train_caption_file)
    coco.filter_by_cap_len(config.max_caption_length)

    print("Building the vocabulary...")
    vocabulary = Vocabulary(config.vocabulary_size)
    if not os.path.exists(config.vocabulary_file):
        vocabulary.build(coco.all_captions())
        vocabulary.save(config.vocabulary_file)
    else:
        vocabulary.load(config.vocabulary_file)
    print("Vocabulary built.")
    print("Number of words = %d" %(vocabulary.size))

    coco.filter_by_words(set(vocabulary.words))

    print("Processing the captions...")
    if not os.path.exists(config.temp_annotation_file):
        captions = [coco.anns[ann_id]['caption'] for ann_id in coco.anns]
        image_ids = [coco.anns[ann_id]['image_id'] for ann_id in coco.anns]
        image_files = [os.path.join(config.train_image_dir,
                                    coco.imgs[image_id]['file_name'])
                                    for image_id in image_ids]
        annotations = pd.DataFrame({'image_id': image_ids,
                                    'image_file': image_files,
                                    'caption': captions})
        annotations.to_csv(config.temp_annotation_file)
    else:
        # Cached annotations are used as-is: the length/word filters applied
        # to `coco` above do not affect them.
        annotations = pd.read_csv(config.temp_annotation_file)
        captions = annotations['caption'].values
        image_ids = annotations['image_id'].values
        image_files = annotations['image_file'].values

    if not os.path.exists(config.temp_data_file):
        word_idxs = []
        masks = []
        for caption in tqdm(captions):
            current_word_idxs_ = vocabulary.process_sentence(caption)
            # Captions read from the cached annotation file may exceed
            # max_caption_length; truncate so the assignment into the
            # fixed-length buffer below cannot fail on a shape mismatch.
            # (assumes process_sentence returns a sliceable sequence — TODO confirm)
            current_word_idxs_ = current_word_idxs_[:config.max_caption_length]
            current_num_words = len(current_word_idxs_)
            current_word_idxs = np.zeros(config.max_caption_length,
                                         dtype = np.int32)
            current_masks = np.zeros(config.max_caption_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs.append(current_word_idxs)
            masks.append(current_masks)
        word_idxs = np.array(word_idxs)
        masks = np.array(masks)
        data = {'word_idxs': word_idxs, 'masks': masks}
        np.save(config.temp_data_file, data)
    else:
        # The cache holds a pickled dict, hence allow_pickle=True. Unpickling
        # can execute arbitrary code, so only load cache files you created.
        data = np.load(config.temp_data_file, encoding='latin1',allow_pickle=True).item()
        word_idxs = data['word_idxs']
        masks = data['masks']
    print("Captions processed.")
    print("Number of captions = %d" %(len(captions)))

    print("Building the dataset...")
    dataset = DataSet(image_ids,
                      image_files,
                      config.batch_size,
                      word_idxs,
                      masks,
                      True,
                      True)
    print("Dataset built.")
    return dataset


In [16]:
config.train_caption_file

'./train/captions_train2014.json'

In [23]:
def build_vocabulary(config):
    """ Create the vocabulary from the length-filtered training captions, save it, and return it. """
    caption_source = COCO(config.train_caption_file)
    caption_source.filter_by_cap_len(config.max_caption_length)

    vocab = Vocabulary(config.vocabulary_size)
    vocab.build(caption_source.all_captions())
    vocab.save(config.vocabulary_file)
    return vocab

In [24]:
config.train_caption_file

'./train/captions_train2014.json'

In [25]:
coco = COCO(config.train_caption_file)

loading annotations into memory...
Done (t=1.03s)
creating index...
index created!


In [26]:
import json

In [40]:
json_string = json.dumps('./train/captions_train2014.json')

In [27]:
with open('./train/captions_train2014.json', 'r') as f:
    datastore = json.load(f)

## Prepare train data

### Clean the COCO captions

In [29]:
config.train_caption_file

'./train/captions_train2014.json'

In [86]:
# Load the caption JSON; the context manager closes the file handle once
# parsing is done (the bare open() call left it open).
with open(config.train_caption_file, 'r') as caption_file:
    dataset = json.load(caption_file)

In [32]:
df.head(1)

Unnamed: 0,image_id,caption,image_file
0,2951351,The first thing people will look at when they ...,./train_new/images/2951351.jpg


In [85]:
df_json['annotations']

[{'image_id': 2951351,
  'image_file': './train_new/images/2951351.jpg',
  'caption': "the first thing people will look at when they see you isn't you it will be you wearing this notre dame fighting irish ncaa basic 59fifty gcp fitted hat from new era that's because they see a person who's loud and proud about the notre dame fighting irish from a mile away as a true fan, that's something you want your team, your school and the world to know."},
 {'image_id': 3428371,
  'image_file': './train_new/images/3428371.jpg',
  'caption': "travel back in time with our new official established collection that the players and staff will wear on the nfl sidelines this season this official nfl sideline home 9fifty is the ultimate adjustable snapback silhouette with a flat visor and classic cap shape this cap features your team's established date embroidered on the right side, a team patch on the front, and the official nfl 100 logo on the rear this cap will take you back to the days when your team o

In [31]:
dataset['annotations']

[{'image_id': 318556,
  'id': 48,
  'caption': 'A very clean and well decorated empty bathroom'},
 {'image_id': 116100,
  'id': 67,
  'caption': 'A panoramic view of a kitchen and all of its appliances.'},
 {'image_id': 318556,
  'id': 126,
  'caption': 'A blue and white bathroom with butterfly themed wall tiles.'},
 {'image_id': 116100,
  'id': 148,
  'caption': 'A panoramic photo of a kitchen and dining room'},
 {'image_id': 379340,
  'id': 173,
  'caption': 'A graffiti-ed stop sign across the street from a red car '},
 {'image_id': 379340,
  'id': 188,
  'caption': 'A vandalized stop sign and a red beetle on the road'},
 {'image_id': 318556,
  'id': 219,
  'caption': 'A bathroom with a border of butterflies and blue paint on the walls above it.'},
 {'image_id': 318556,
  'id': 255,
  'caption': 'An angled view of a beautifully decorated bathroom.'},
 {'image_id': 134754,
  'id': 272,
  'caption': 'The two people are walking down the beach.'},
 {'image_id': 538480,
  'id': 288,
  'ca

In [87]:
def process_dataset():
    """Normalize the captions of the global `dataset` in place.

    Each caption is lower-cased and given a trailing '.' when it does not
    already end with one.
    """
    for ann in dataset['annotations']:
        caption = ann['caption'].lower()
        # endswith() also handles an empty caption, where indexing the last
        # character raised IndexError.
        if not caption.endswith('.'):
            caption = caption + '.'
        ann['caption'] = caption

process_dataset()

In [84]:
def process_dataset_new():
    """Normalize the captions of the global `df_json` in place.

    Each caption is lower-cased, stripped of every '.', and then given a
    single trailing '.'.
    """
    for record in df_json['annotations']:
        text = record['caption'].lower().replace('.', '')
        record['caption'] = text + '.'

process_dataset_new()

In [104]:
def createIndex():
    """Build lookup tables over the global `dataset`.

    Returns a 4-tuple:
        anns           -- annotation id -> annotation
        imgToAnns      -- image id -> list of its annotations
        imgs           -- image id -> image record
        img_name_to_id -- image file name -> image id
    Tables whose source key ('annotations' / 'images') is missing stay empty.
    """
    print ('creating index...')
    anns, imgToAnns, imgs, img_name_to_id = {}, {}, {}, {}

    if 'annotations' in dataset:
        for ann in dataset['annotations']:
            imgToAnns.setdefault(ann['image_id'], []).append(ann)
            anns[ann['id']] = ann

    if 'images' in dataset:
        for img in dataset['images']:
            imgs[img['id']] = img
            img_name_to_id[img['file_name']] = img['id']

    print ('index created!')
    return anns,imgToAnns,imgs,img_name_to_id

anns,imgToAnns,imgs,img_name_to_id = createIndex()

In [105]:
anns,imgToAnns,imgs,img_name_to_id = createIndex()

creating index...
index created!


In [151]:
dataset['images']

[{'license': 5,
  'file_name': 'COCO_train2014_000000057870.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000057870.jpg',
  'height': 480,
  'width': 640,
  'date_captured': '2013-11-14 16:28:13',
  'flickr_url': 'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg',
  'id': 57870},
 {'license': 5,
  'file_name': 'COCO_train2014_000000384029.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000384029.jpg',
  'height': 429,
  'width': 640,
  'date_captured': '2013-11-14 16:29:45',
  'flickr_url': 'http://farm3.staticflickr.com/2422/3577229611_3a3235458a_z.jpg',
  'id': 384029},
 {'license': 1,
  'file_name': 'COCO_train2014_000000222016.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000222016.jpg',
  'height': 640,
  'width': 480,
  'date_captured': '2013-11-14 16:37:59',
  'flickr_url': 'http://farm2.staticflickr.com/1431/1118526611_09172475e5_z.jpg',
  'id': 222016},
 {'license': 3

In [149]:
anns

{48: {'image_id': 318556,
  'id': 48,
  'caption': 'a very clean and well decorated empty bathroom.'},
 67: {'image_id': 116100,
  'id': 67,
  'caption': 'a panoramic view of a kitchen and all of its appliances.'},
 126: {'image_id': 318556,
  'id': 126,
  'caption': 'a blue and white bathroom with butterfly themed wall tiles.'},
 148: {'image_id': 116100,
  'id': 148,
  'caption': 'a panoramic photo of a kitchen and dining room.'},
 173: {'image_id': 379340,
  'id': 173,
  'caption': 'a graffiti-ed stop sign across the street from a red car .'},
 188: {'image_id': 379340,
  'id': 188,
  'caption': 'a vandalized stop sign and a red beetle on the road.'},
 219: {'image_id': 318556,
  'id': 219,
  'caption': 'a bathroom with a border of butterflies and blue paint on the walls above it.'},
 255: {'image_id': 318556,
  'id': 255,
  'caption': 'an angled view of a beautifully decorated bathroom.'},
 272: {'image_id': 134754,
  'id': 272,
  'caption': 'the two people are walking down the bea

In [148]:
imgs

{57870: {'license': 5,
  'file_name': 'COCO_train2014_000000057870.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000057870.jpg',
  'height': 480,
  'width': 640,
  'date_captured': '2013-11-14 16:28:13',
  'flickr_url': 'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg',
  'id': 57870},
 384029: {'license': 5,
  'file_name': 'COCO_train2014_000000384029.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000384029.jpg',
  'height': 429,
  'width': 640,
  'date_captured': '2013-11-14 16:29:45',
  'flickr_url': 'http://farm3.staticflickr.com/2422/3577229611_3a3235458a_z.jpg',
  'id': 384029},
 222016: {'license': 1,
  'file_name': 'COCO_train2014_000000222016.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000222016.jpg',
  'height': 640,
  'width': 480,
  'date_captured': '2013-11-14 16:37:59',
  'flickr_url': 'http://farm2.staticflickr.com/1431/1118526611_09172475e5_z.jpg',
  'id': 

In [146]:
img_name_to_id

{'COCO_train2014_000000057870.jpg': 57870,
 'COCO_train2014_000000384029.jpg': 384029,
 'COCO_train2014_000000222016.jpg': 222016,
 'COCO_train2014_000000520950.jpg': 520950,
 'COCO_train2014_000000069675.jpg': 69675,
 'COCO_train2014_000000547471.jpg': 547471,
 'COCO_train2014_000000122688.jpg': 122688,
 'COCO_train2014_000000392136.jpg': 392136,
 'COCO_train2014_000000398494.jpg': 398494,
 'COCO_train2014_000000090570.jpg': 90570,
 'COCO_train2014_000000504616.jpg': 504616,
 'COCO_train2014_000000161919.jpg': 161919,
 'COCO_train2014_000000457732.jpg': 457732,
 'COCO_train2014_000000044404.jpg': 44404,
 'COCO_train2014_000000004428.jpg': 4428,
 'COCO_train2014_000000170558.jpg': 170558,
 'COCO_train2014_000000405613.jpg': 405613,
 'COCO_train2014_000000283524.jpg': 283524,
 'COCO_train2014_000000037015.jpg': 37015,
 'COCO_train2014_000000071631.jpg': 71631,
 'COCO_train2014_000000491269.jpg': 491269,
 'COCO_train2014_000000365363.jpg': 365363,
 'COCO_train2014_000000064460.jpg': 6446

In [102]:
imgs

{57870: {'license': 5,
  'file_name': 'COCO_train2014_000000057870.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000057870.jpg',
  'height': 480,
  'width': 640,
  'date_captured': '2013-11-14 16:28:13',
  'flickr_url': 'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg',
  'id': 57870},
 384029: {'license': 5,
  'file_name': 'COCO_train2014_000000384029.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000384029.jpg',
  'height': 429,
  'width': 640,
  'date_captured': '2013-11-14 16:29:45',
  'flickr_url': 'http://farm3.staticflickr.com/2422/3577229611_3a3235458a_z.jpg',
  'id': 384029},
 222016: {'license': 1,
  'file_name': 'COCO_train2014_000000222016.jpg',
  'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000222016.jpg',
  'height': 640,
  'width': 480,
  'date_captured': '2013-11-14 16:37:59',
  'flickr_url': 'http://farm2.staticflickr.com/1431/1118526611_09172475e5_z.jpg',
  'id': 

In [None]:
def createIndex_new():
    """Build lookup tables over the global `df_json` (the product-caption data).

    This used to be a verbatim copy of `createIndex` and therefore indexed the
    global `dataset` instead of `df_json`.

    Returns a 4-tuple:
        anns           -- annotation id -> annotation
        imgToAnns      -- image id -> list of its annotations
        imgs           -- image id -> image record
        img_name_to_id -- image file name -> image id
    The two image tables stay empty when `df_json` has no 'images' entry.
    """
    print ('creating index...')
    anns = {}
    imgToAnns = {}
    imgs = {}
    img_name_to_id = {}

    for position, ann in enumerate(df_json.get('annotations', [])):
        # The records built from the dataframe carry no 'id' key; fall back to
        # the list position so each annotation still gets a unique key.
        # NOTE(review): confirm that a positional id is acceptable downstream.
        ann_id = ann.get('id', position)
        anns[ann_id] = ann
        imgToAnns.setdefault(ann['image_id'], []).append(ann)

    for img in df_json.get('images', []):
        imgs[img['id']] = img
        img_name_to_id[img['file_name']] = img['id']

    print ('index created!')
    return anns,imgToAnns,imgs,img_name_to_id
anns,imgToAnns,imgs,img_name_to_id = createIndex_new()

### filter_by_cap_len

In [107]:
coco.filter_by_cap_len(config.max_caption_length)

  0%|          | 622/414113 [00:00<01:06, 6212.79it/s]

Filtering the captions by length...


100%|██████████| 414113/414113 [01:01<00:00, 6707.73it/s]


creating index...
index created!


In [109]:
config.max_caption_length

20

In [132]:
df[df['caption'].str.split(' ').apply(len) > 100].shape

(1003, 3)

In [136]:
len(dataset['annotations'])

414113

In [140]:
from nltk.tokenize import word_tokenize

def filter_by_cap_len(max_cap_len):
    """Drop captions of the global `dataset` that tokenize to more than `max_cap_len` tokens.

    `dataset['annotations']` and `dataset['images']` are replaced by their
    filtered versions, the index is rebuilt with `createIndex`, and the four
    index tables are returned followed by the per-annotation and per-image
    keep counters.
    """
    print("Filtering the captions by length...")
    keep_ann = {}
    keep_img = {}
    for ann in tqdm(dataset['annotations']):
        num_tokens = len(word_tokenize(ann['caption']))
        if num_tokens > max_cap_len:
            continue
        keep_ann[ann['id']] = keep_ann.get(ann['id'], 0) + 1
        keep_img[ann['image_id']] = keep_img.get(ann['image_id'], 0) + 1

    kept_annotations = []
    for ann in dataset['annotations']:
        if keep_ann.get(ann['id'], 0) > 0:
            kept_annotations.append(ann)
    dataset['annotations'] = kept_annotations

    kept_images = []
    for img in dataset['images']:
        if keep_img.get(img['id'], 0) > 0:
            kept_images.append(img)
    dataset['images'] = kept_images

    annotation_index, image_annotations, image_index, name_index = createIndex()
    return annotation_index, image_annotations, image_index, name_index, keep_ann, keep_img

In [141]:
anns,imgToAnns,imgs,img_name_to_id,keep_ann,keep_img = filter_by_cap_len(20)

  0%|          | 1350/409884 [00:00<01:00, 6709.94it/s]

Filtering the captions by length...


100%|██████████| 409884/409884 [00:59<00:00, 6848.38it/s]


creating index...
index created!


In [143]:
len(keep_ann)

409884

In [144]:
len(keep_img)

82783

In [None]:
df_json['annotations']

In [None]:
def filter_by_cap_len_new(max_cap_len):
    """Drop captions of the global `df_json` that tokenize to more than `max_cap_len` tokens.

    Previously `keep_ann` was never filled and the filter looked up
    `ann['id']`, a key the `df_json` records do not have, so the filtering
    step could not succeed. Annotations are now kept or dropped directly while
    iterating.

    Returns the four tables from `createIndex_new()` followed by the
    per-annotation and per-image keep counters, mirroring `filter_by_cap_len`.
    """
    print("Filtering the captions by length...")
    keep_ann = {}
    keep_img = {}
    kept_annotations = []
    for position, ann in enumerate(tqdm(df_json['annotations'])):
        if len(word_tokenize(ann['caption']))<=max_cap_len:
            # Records without an 'id' are keyed by their position in the list.
            ann_id = ann.get('id', position)
            keep_ann[ann_id] = keep_ann.get(ann_id, 0) + 1
            keep_img[ann['image_id']] = keep_img.get(ann['image_id'], 0) + 1
            kept_annotations.append(ann)

    df_json['annotations'] = kept_annotations
    # No 'images' list is filtered here; keep_img is only returned to the caller.

    anns,imgToAnns,imgs,img_name_to_id = createIndex_new()
    return anns,imgToAnns,imgs,img_name_to_id,keep_ann,keep_img


### filter_by_words

In [None]:
 def filter_by_words(self, vocab):
        """Drop annotations whose caption contains a token missing from `vocab`.

        `self.dataset['annotations']` and `self.dataset['images']` are replaced
        by their filtered versions and the index is rebuilt with
        `self.createIndex()`. A stray trailing backtick after that call, which
        made the cell a syntax error, has been removed.
        """
        print("Filtering the captions by words...")
        keep_ann = {}
        keep_img = {}
        for ann in tqdm(self.dataset['annotations']):
            keep_ann[ann['id']] = 1
            words_in_ann = word_tokenize(ann['caption'])
            for word in words_in_ann:
                if word not in vocab:
                    keep_ann[ann['id']] = 0
                    break
            # NOTE(review): the image is counted even when its annotation was
            # just rejected, so images left without captions are still kept —
            # confirm whether that is intended.
            keep_img[ann['image_id']] = keep_img.get(ann['image_id'], 0) + 1

        self.dataset['annotations'] = \
            [ann for ann in self.dataset['annotations'] \
            if keep_ann.get(ann['id'],0)>0]
        self.dataset['images'] = \
            [img for img in self.dataset['images'] \
            if keep_img.get(img['id'],0)>0]

        self.createIndex()

## Run all the functions

In [32]:
# `FLAGS` is not defined anywhere in this notebook, so referencing it raised
# NameError after the expensive data preparation. The same switches are read
# from `config` instead and default to a fresh training run.
# NOTE(review): presumably FLAGS came from a command-line entry point — confirm
# the attribute names and the default CNN weight file below.
load_model = getattr(config, 'load', False)
load_cnn = getattr(config, 'load_cnn', False)

with tf.Session() as sess:

    # training phase
    data = prepare_train_data(config)
    model = CaptionGenerator(config)
    sess.run(tf.global_variables_initializer())
    if load_model:
        model.load(sess, getattr(config, 'model_file', None))
    if load_cnn:
        model.load_cnn(sess, getattr(config, 'cnn_model_file', './vgg16_no_fc.npy'))
    tf.get_default_graph().finalize()
    model.train(sess, data)

loading annotations into memory...
Done (t=0.72s)
creating index...


  0%|          | 655/414113 [00:00<01:03, 6545.39it/s]

index created!
Filtering the captions by length...


 12%|█▏        | 50924/414113 [00:07<00:51, 7049.76it/s]


KeyboardInterrupt: 