# Training

## Prepare training and validation datasets.

In [None]:
from utils.dataset import MSCOCO, Vocabulary

In [None]:
# Root directory of the datasets, given relative to the current working
# directory. Later cells reuse this name.
dataset_path = '../datasets/'

# Image directory and caption annotation file of the training split.
ms_coco_train_image_dir = dataset_path + 'ms_coco/train2014/'
ms_coco_train_caption_file = (
    dataset_path + 'ms_coco/annotations/captions_train2014.json'
)

# Wrap the training captions and images in the project's MSCOCO dataset.
training_dataset = MSCOCO(
    image_dir=ms_coco_train_image_dir,
    caption_file=ms_coco_train_caption_file,
)

In [None]:
# Show the size reported by the training dataset (displayed as the cell
# output). Presumably the number of examples -- confirm in utils.dataset.
training_dataset.get_size()

In [None]:
# Image directory and caption annotation file of the validation split.
# `dataset_path` is the dataset root defined in the training-dataset cell.
ms_coco_val_image_dir = dataset_path + 'ms_coco/val2014/'
ms_coco_val_caption_file = (
    dataset_path + 'ms_coco/annotations/captions_val2014.json'
)

# Wrap the validation captions and images in the project's MSCOCO dataset.
validation_dataset = MSCOCO(
    image_dir=ms_coco_val_image_dir,
    caption_file=ms_coco_val_caption_file,
)

In [None]:
# Show the size reported by the validation dataset (displayed as the cell
# output). Presumably the number of examples -- confirm in utils.dataset.
validation_dataset.get_size()

## Build vocabulary from the training dataset.

In [None]:
# Build the vocabulary from the training dataset only.
# NOTE(review): min_word_count=3 presumably drops words seen fewer than
# three times -- confirm against utils.dataset.Vocabulary.
vocabulary = Vocabulary(dataset=training_dataset, min_word_count=3)

In [None]:
# Show the size reported by the vocabulary (displayed as the cell output).
vocabulary.get_size()

In [None]:
# Save the vocabulary to a JSON file; the model-construction cells below
# reference it by file path.
# NOTE(review): assumes the 'vocabs/' directory already exists -- confirm
# whether Vocabulary.save creates missing directories.
vocabulary.save('vocabs/ms_coco_vocabulary.json')

## Train the model.

In [None]:
from tf_model.model import Image2Text

In [None]:
# Create a model wired to both datasets, the saved vocabulary file and the
# default configuration file. No checkpoint path is passed here.
img2txt = Image2Text(
    config_file_path='default_config.json',
    vocabulary_file_path='vocabs/ms_coco_vocabulary.json',
    training_dataset=training_dataset,
    validation_dataset=validation_dataset,
)

In [None]:
# Train with max_num_steps=100; whatever train() returns is kept in `rd`.
rd = img2txt.train(max_num_steps=100)

## Load a saved model and run additional training.

In [None]:
# Re-create the model from the artifacts of an earlier training run.
# NOTE(review): '0822_021734' looks like the output directory of one
# specific run, and 'img2txt-100' like its checkpoint at step 100 --
# replace both with the paths produced by your own run.
img2txt = Image2Text(
    training_dataset=training_dataset,
    validation_dataset=validation_dataset,
    # Same vocabulary file that vocabulary.save(...) wrote and that the
    # initial model was built with; the bare 'ms_coco_vocabulary.json'
    # used here before did not match that saved location.
    vocabulary_file_path='vocabs/ms_coco_vocabulary.json',
    config_file_path='0822_021734/config.json',
    checkpoint_save_path='0822_021734/img2txt-100',
)

In [None]:
# Run training again on the model built in the previous cell, passing
# additional_num_steps=100; whatever train() returns is kept in `rd`.
rd = img2txt.train(additional_num_steps=100)