# Caption AI
---

## Train Model

In [1]:
import math
import pytz
import torch
import torch.nn as nn
from datetime import datetime
from pycocotools.coco import COCO
from data_loader import get_loader
from torchvision import transforms
from model import EncoderCNN, DecoderRNN

In [2]:
## TODO #1: Select appropriate values for the Python variables below.
batch_size = 256          # batch size
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = True    # if True, load existing vocab file
embed_size = 1024           # dimensionality of image and word embeddings
hidden_size = 512          # number of features in hidden state of the RNN decoder
save_every = 1             # determines frequency of saving model weights
print_every = 100          # determines window for printing average loss
log_file = 'training_log.txt'       # name of file with saved training loss and perplexity

tz = 'Asia/Kolkata'
localFormat = "%Y-%m-%d %H:%M:%S"

In [3]:
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

transform_test = transforms.Compose([transforms.Resize(224),
                                     transforms.CenterCrop(224),
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.485, 0.456, 0.406),
                                                          (0.229, 0.224, 0.225))])

In [4]:
# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# Build val data loader.
val_data_loader = get_loader(transform=transform_test,
                         mode='val',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)
# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

COCO location: /home/jupyter/Caption-AI/ImageCaptioning/COCODataset
Loading train Images form: /home/jupyter/Caption-AI/ImageCaptioning/COCODataset/images/train2014
Loading train Annotarions form: /home/jupyter/Caption-AI/ImageCaptioning/COCODataset/annotations/captions_train2014.json
Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.71s)
creating index...


  0%|          | 822/414113 [00:00<00:50, 8217.07it/s]

index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:42<00:00, 9850.20it/s] 


COCO location: /home/jupyter/Caption-AI/ImageCaptioning/COCODataset
Loading val Images form: /home/jupyter/Caption-AI/ImageCaptioning/COCODataset/images/val2014
Loading val Annotarions form: /home/jupyter/Caption-AI/ImageCaptioning/COCODataset/annotations/captions_val2014.json
Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...


  0%|          | 903/202654 [00:00<00:22, 9021.15it/s]

Done (t=0.29s)
creating index...
index created!
Obtaining caption lengths...


100%|██████████| 202654/202654 [00:20<00:00, 9654.70it/s] 


In [5]:
# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers=3, drop=0.5)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

DecoderRNN(
  (embed): Embedding(9955, 1024)
  (lstm): LSTM(1024, 512, num_layers=3, batch_first=True, dropout=0.5)
  (drop): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=9955, bias=True)
)

In [6]:
# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) + list(encoder.embed.parameters()) 

# TODO #4: Define the optimizer.
optimizer = torch.optim.Adam(params=params, lr = 0.001)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

### Optional

In [None]:
images, captions = next(iter(data_loader))

In [None]:
captions.dtype

In [7]:
import os
encoder_file = 'encoder-500.pkl'
decoder_file = 'decoder-500.pkl'
optim_file = 'optim-500.pkl'

# Load pre-trained weights before resuming training.
encoder.load_state_dict(torch.load(os.path.join('./modelsX', encoder_file)))
decoder.load_state_dict(torch.load(os.path.join('./modelsX', decoder_file)))
optimizer.load_state_dict(torch.load(os.path.join('./modelsX', optim_file)))

In [None]:
import torch.utils.data as data
import numpy as np
import os
import time

# Adding the Previous epochs
previous_epoch = 20
num_epochs = 25

valid_loss_min = np.Inf # track change in validation loss

# Open the training log file.
# f = open(log_file, 'w')
# f.close()

for epoch in range(1+previous_epoch, num_epochs+1):
    # keep track of training and validation loss
    train_loss = 0.0
    
    for i_step in range(1, total_step+1):

        decoder.train()
        encoder.train()
        
        # Randomly sample a caption length, and sample indices with that length.
        indices = data_loader.dataset.get_train_indices()
        # Create and assign a batch sampler to retrieve a batch with the sampled indices.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        data_loader.batch_sampler.sampler = new_sampler
        
        # Obtain the batch.
        images, captions = next(iter(data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)
        
        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()
        
        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)
        
        # Calculate the batch loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
        
        # Backward pass.
        loss.backward()
        
        # Update the parameters in the optimizer.
        nn.utils.clip_grad_norm_(params, 5)
        optimizer.step()
        
        # update training loss
        train_loss += loss.item()
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            
            decoder.train()
            encoder.train()

            indices_v = val_data_loader.dataset.get_train_indices()
            new_sampler_v = data.sampler.SubsetRandomSampler(indices=indices_v)
            val_data_loader.batch_sampler.sampler = new_sampler_v
            
            images, captions = next(iter(val_data_loader))
                
            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)
                
            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)
                
            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
            
            # Valid loss
            valid_loss = loss.item()
            
            # Get training statistics.
            _train_loss = train_loss / print_every
            _p_train_loss = np.exp(_train_loss)
            
            #Get current time
            _t_now = datetime.utcnow().replace(tzinfo=pytz.utc).astimezone(pytz.timezone(tz)).strftime(localFormat)
            
            stats = '[%s] Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Valid Loss: %.4f' % (_t_now, epoch, num_epochs, i_step, total_step,  _train_loss, _p_train_loss, valid_loss)
            
            # Print and Reset
            print(stats)
            train_loss = 0.0
            
            # save model if validation loss has decreased
            if valid_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                    valid_loss_min,
                    valid_loss))
                torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % i_step))
                torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % i_step))
                # saving the optimizer for future use
                torch.save(optimizer.state_dict(), os.path.join('./models', 'optim-%d.pkl' % i_step))
                valid_loss_min = valid_loss
            
            # Append log
            with open(log_file, 'a+') as f:
                f.write(stats + '\n')
            
            
    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
        torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))
        # saving the optimizer for future use
        torch.save(optimizer.state_dict(), os.path.join('./models', 'optim-%d.pkl' % epoch))

[2020-06-06 16:03:10] Epoch [21/25], Step [100/1618], Loss: 2.1424, Perplexity: 8.5195, Valid Loss: 2.2964
Validation loss decreased (inf --> 2.296426).  Saving model ...
[2020-06-06 16:23:45] Epoch [21/25], Step [200/1618], Loss: 2.1725, Perplexity: 8.7803, Valid Loss: 2.3142
