In [1]:
%matplotlib inline
from re import sub 

import numpy as np
import torch
from pycocotools.coco import COCO
# import matplotlib.pyplot as plt
# import skimage.io as io

from agent import Agent
from environment import Environment
from settings import *

In [2]:
NUM_SAMPLES = 500
SPLIT = 'val'

In [3]:
BASE_LSTM = 'RETRAIN-0512-2307-E9'
SCST = 'RL-0516-1547-E9'
CONTEXT_1 = 'RL-0517-0417-E9'

In [10]:
env = Environment()
agent = Agent(env=env)
agent.actor.load_state_dict(torch.load(
    MODEL_DIR.format(BASE_LSTM), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
print('LOADED ACTOR WITH BASE_LSTM WEIGHTS: ', BASE_LSTM)
coco = COCO(CAPTIONS_DIR.format('val'))

with open(KARPATHY_SPLIT_DIR.format(SPLIT)) as f:
    img_ids = f.read().split('\n')[:-1]
img_ids = [int(x.split()[-1]) for x in img_ids]

LOADED ACTOR WITH BASE_LSTM WEIGHTS:  RETRAIN-0512-2307-E9
loading annotations into memory...
Done (t=0.26s)
creating index...
index created!


In [33]:
img_ids_ = np.random.choice(img_ids, size=NUM_SAMPLES)

In [34]:
caption_ids = coco.getAnnIds(img_ids_)
captions = np.array([' '.join([sub(r'[^\w ]', '', caption['caption'].lower()).strip(), '<EOS>'])
                    for caption in coco.loadAnns(caption_ids)]).reshape(NUM_SAMPLES, -1)

# UGH
captions = list(map(list, captions))
ground_truth = dict(zip(img_ids_, captions))

In [35]:
img_features = torch.Tensor(
    [np.load(FEATURES_DIR.format(img_id))
     for img_id in img_ids_])

In [36]:
predictions = agent.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

1.0530917078059252


In [51]:
# top_idxs = np.argsort(scores)[-5:]
# worst_idxs = np.argsort(scores)[:5]

# print('Image IDs where we got the highest scores: ', img_ids_[top_idxs])
# print('with scores: ', scores[top_idxs])
# print('Image IDs where we got the worst scores: ', img_ids_[worst_idxs])
# print('with scores: ', scores[worst_idxs])

# GET 5 RANDOM INDECES
idxs = np.random.choice(len(scores), 10)
top_idxs = idxs[:5]
worst_idxs = idxs[5:]

In [None]:
# # insert code here to show the pictures...
# img = coco.loadImgs(imgId)[0]
# I = io.imread('%s/images/%s/%s'%(dataDir,dataType,img['file_name']))
# plt.imshow(I)
# plt.axis('off')
# plt.show()

In [52]:
print('Captions from top scoring:')
print(np.array(predictions)[top_idxs])

print('Captions from worst scoring:')
print(np.array(predictions)[worst_idxs])

Captions from top scoring:
[['a group of motorcycles parked on the side of a street <EOS>']
 ['a traffic light on the side of a street <EOS>']
 ['a group of people walking down a street with a bus <EOS>']
 ['a man holding a tennis ball on a tennis court <EOS>']
 ['a train is sitting at a train station <EOS>']]
Captions from worst scoring:
[['a herd of horses are walking in the water <EOS>']
 ['a boat is sitting in the water <EOS>']
 ['a man is flying a kite on the beach <EOS>']
 ['a bathroom with a toilet and a sink <EOS>']
 ['a group of people flying kites in a field <EOS>']]


In [53]:
print('Ground truth captions for top scoring: ')
print(np.array(captions)[top_idxs])

Ground truth captions for top scoring: 
[['one bicycle is parked next to many motorcycles <EOS>'
  'a row of motorcycles next to a bicycle <EOS>'
  'a group of bikes and motorcycles parked on a city street <EOS>'
  'bicycles and scooters lined up along the sidewalk in a quaint town <EOS>'
  'bicycles lined up on the side of the road <EOS>']
 ['a traffic signal sitting next to a street at night <EOS>'
  'traffic light at night appearing very confusing <EOS>'
  'the electronic stop sign glows brightly at night time <EOS>'
  'a variety of traffic lights and road signs <EOS>'
  'there is a street light with two green arrows in different directions <EOS>']
 ['an ambulance and police cars are stopped at the scene of an accident <EOS>'
  'an ambulance and police are on the side of the road <EOS>'
  'an ambulance police cars and firetruck attending to some kind of road emergency <EOS>'
  'an ambulance and cop cars at an accident in a street <EOS>'
  'an accident on a road with an ambulance and

In [54]:
print('Ground truth captions for worst scoring: ')
print(np.array(captions)[worst_idxs])

Ground truth captions for worst scoring: 
[['a bunch of horses that are standing in the water <EOS>'
  'two men riding horses and many horses water dirt and trees <EOS>'
  'two men on horseback herd some horses across a stream <EOS>'
  'a herd of horses runs through a stream <EOS>'
  'cowboys herding horses across a small stream in a valley <EOS>']
 ['a large metal fork sticking out of a lake next to a boat <EOS>'
  'an image a large fork perched in the water <EOS>'
  'a seemingly very large fork stuck in the water  with a ship behind it <EOS>'
  'a large fork sculpture stands in the water as a large boat passes <EOS>'
  'a large fork on the water where a boat is in the background <EOS>']
 ['a person flying a kite on a beach at dusk <EOS>'
  'two people on the beach and one is flying a kite <EOS>'
  'a couple of people on a beach flying a kite <EOS>'
  'people flying a yellow kite on at sunrise on a beach <EOS>'
  'a kite flying in the air over a sand castle <EOS>']
 ['a bathroom scene

## Evaluate SCST on the same images

In [57]:
agent_scst = Agent(env=env)
agent_scst.actor.load_state_dict(torch.load(
    MODEL_DIR.format(SCST), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

LOADED ACTOR WITH SCST WEIGHTS:  RL-0516-1547-E9


In [58]:
predictions = agent_scst.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

1.1298931554750955


In [59]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a group of motorcycles parked on the side of a street <EOS>']
 ['a traffic light on the side of a street <EOS>']
 ['a group of people walking down a street with a bus <EOS>']
 ['a man holding a tennis ball on a tennis court <EOS>']
 ['a train is sitting at a train station <EOS>']]
[['a herd of horses are walking in the water <EOS>']
 ['a boat is sitting in the water <EOS>']
 ['a man is flying a kite on the beach <EOS>']
 ['a bathroom with a toilet and a sink <EOS>']
 ['a group of people flying kites in a field <EOS>']]


### Notice some improvements on the sentences!

## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.5$)

Version 1: 180 - mean_pred_dist

Up to Epoch=5

In [None]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0517-0417-E4'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [None]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))

In [None]:
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

In [None]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.5$)

Version 1: 180 - mean_pred_dist

Up to Epoch=10

In [24]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0517-0417-E9'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [25]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

1.067835275130549


In [26]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a man is doing a trick on a skateboard <EOS>']
 ['a motorcycle is parked on the side of a road <EOS>']
 ['a woman is sitting in the rain with an umbrella <EOS>']
 ['a pair of scissors sitting on top of a table <EOS>']
 ['a clock tower on the top of a building <EOS>']]
[['two zebras are standing in a field <EOS>']
 ['a living room with a couch and a table <EOS>']
 ['a train is sitting at a train station <EOS>']
 ['a bathroom with a toilet and a sink <EOS>']
 ['a man is riding a wave on a surfboard <EOS>']]


## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.1$, LR=5e-4)

Version 2: `1 - [(gt - pred) / gt]`

Up to Epoch=5, Greedy context up to 0.84~

In [None]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0522-0942-E4'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [None]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

In [None]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.1$, LR=1e-4)

Version 2: `1 - [(gt - pred) / gt]`

Up to Epoch=5, Greedy context up to 0.8388

In [27]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0522-1313-E4'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [28]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

0.9639909200550679


In [29]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a man riding a skateboard on a ramp <EOS>']
 ['a motorcycle parked on a street next to a tree <EOS>']
 ['a woman sitting on a umbrella with a umbrella <EOS>']
 ['a pair of scissors sitting on a cutting board <EOS>']
 ['a building with a clock tower sitting on top <EOS>']]
[['a group of zebras standing on a grass <EOS>']
 ['a living room with a white couch and table <EOS>']
 ['a train sitting on a street next to a street sign <EOS>']
 ['a bathroom with a white toilet and sink <EOS>']
 ['a man riding a surfboard on a wave <EOS>']]


## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.1$, LR=1e-4)

Version 2: `1 - [(gt - pred) / gt]`

Up to Epoch=10, Greedy context up to 0.8536

In [60]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0522-1313-E9'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [61]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

1.0021111817846577


In [62]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a group of motorcycles parked on a street top <EOS>']
 ['a traffic light with traffic lights sitting on a street top <EOS>']
 ['a group of people standing on a street with a truck bus <EOS>']
 ['a man holding a tennis racket at a ball <EOS>']
 ['a train sitting on a train station <EOS>']]
[['a group of horses riding on a water river <EOS>']
 ['a boat with a surfboard sitting on the water <EOS>']
 ['a man flying a kite on the beach <EOS>']
 ['a bathroom with a white toilet and sink <EOS>']
 ['a group of people flying kites on a field <EOS>']]


## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.1$, LR=1e-4)

Version 2: `1 - [(gt - pred) / gt]`

Up to Epoch=13, Greedy context up to 0.8604

In [63]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0522-1313-E12'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [64]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

0.9988473615785126


In [65]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a group of motorcycles parked on a street top next to a street top <EOS>']
 ['a traffic light with traffic lights sitting on a street top <EOS>']
 ['a woman standing on a street top with a truck bus <EOS>']
 ['a man holding a tennis racket at a tennis ball <EOS>']
 ['a train sitting on a train tracks station <EOS>']]
[['a herd of horses walking on a water river <EOS>']
 ['a boat with boats sitting on the water <EOS>']
 ['a man flying a kite on the beach <EOS>']
 ['a bathroom with a white toilet and sink <EOS>']
 ['a group of people flying kites on a field top <EOS>']]


## Evaluate CIDEr + Context Reward on the same images
LR=1e-4

Version 3: `1 - [(gt - pred) / gt]` (but gt excludes 0 values from the mean)
CIDEr weight: 1.  Context weight: 2

Up to Epoch=4,

In [None]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format('RL-0523-0900-E3'), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
# print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

In [None]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
predictions_ = dict(zip(img_ids_, predictions))
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean)

In [None]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

# Observations
1. CIDEr-optimized captions lack complete sentence structure. For example, the word 'is' disappears from the predictions, while it is present in the base cross-entropy trained model.
2. CIDEr-optimized captions are much shorter and convey less scene context. For example, the base model predicts `a woman is walking down a street while holding a camera <EOS>`, but the CIDEr- optimized only predicts `'a woman walking down a street with a bus <EOS>'`. This removes the detail that the woman is holding a camera, but also mistakenly says that the woman is walking with a bus. Adding a context reward to CIDEr optimization **need to add comparisons from context-reward predictions**. Another example is `'a man sitting in a chair with a dog on his lap <EOS>'` from the base model, but CIDEr-optimized simplifies this into `'a man sitting on a couch with a dog <EOS>'`, lacking detail. Four out of the five ground truth captions include the detail that the dog is on the lap.

Papers [CITE] directly optimize for CIDEr, and has shown that optimizing for CIDEr also increases other commonly used metrics such as BLEU. However, the observations mentioned show that directly optimizing for CIDEr is not a good end-all be-all method to improve the quality of the captions.  (other papers may have also said this before). 

While the base LSTM model trained with cross-entropy incorporates more detail into its predictions, it is a rigid way of generating text. methods in which the agent is incentivized by the common metrics such as CIDEr and BLEU does not take full advantage of reinforcement learning paradigm, as it still forms a rigid learning goal. Training an agent based on context may be more beneficial in the long run, as it can learn to associate certain contexts with images, instead of relying on word order and word combinations, as is done with direct CIDEr optimization.

Through this simple experiment on adding a context reward term, it may be possible to create a (middleground???). However, further experiments on the addition of the context reward term is necessary. Moreover, incorporating exploration strategies may prove to be useful, as it can take advantage of the more lenient context reward.

In [None]:
np.array(img_ids_)[top_idxs]

In [None]:
np.array(img_ids_)[worst_idxs]