In [22]:
%matplotlib inline
from re import sub 

import numpy as np
import torch
from pycocotools.coco import COCO
# import matplotlib.pyplot as plt
# import skimage.io as io

from agent import Agent
from environment import Environment
from settings import *

ModuleNotFoundError: No module named 'skimage'

In [2]:
NUM_SAMPLES = 500
SPLIT = 'val'

In [3]:
BASE_LSTM = 'RETRAIN-0512-2307-E9'
SCST = 'RL-0516-2109-E4'
CONTEXT = 'RL-0517-0417-E4'

In [4]:
env = Environment()
agent = Agent(env=env)
agent.actor.load_state_dict(torch.load(
    MODEL_DIR.format(BASE_LSTM), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
print('LOADED ACTOR WITH BASE_LSTM WEIGHTS: ', BASE_LSTM)
coco = COCO(CAPTIONS_DIR.format('val'))

with open(KARPATHY_SPLIT_DIR.format(SPLIT)) as f:
    img_ids = f.read().split('\n')[:-1]
img_ids = [int(x.split()[-1]) for x in img_ids]

LOADED ACTOR WITH BASE_LSTM WEIGHTS:  RETRAIN-0512-2307-E9
loading annotations into memory...
Done (t=0.35s)
creating index...
index created!


In [5]:
img_ids_ = np.random.choice(img_ids, size=NUM_SAMPLES)

In [6]:
caption_ids = coco.getAnnIds(img_ids_)
captions = np.array([' '.join([sub(r'[^\w ]', '', caption['caption'].lower()).strip(), '<EOS>'])
                    for caption in coco.loadAnns(caption_ids)]).reshape(NUM_SAMPLES, -1)

# UGH
captions = list(map(list, captions))
ground_truth = dict(zip(img_ids_, captions))

In [7]:
img_features = torch.Tensor(
    [np.load(FEATURES_DIR.format(img_id))
     for img_id in img_ids_])

img_features.shape

torch.Size([500, 36, 2048])

In [8]:
predictions = agent.predict_captions(img_features, mode='greedy', constrain=True)
print(predictions[:10])

predictions_ = dict(zip(img_ids_, predictions))

[['a bunch of toilets in a room with a lot of toilets <EOS>'], ['a dog is standing in the grass with a frisbee <EOS>'], ['a person riding a horse on a dirt road <EOS>'], ['a man in a helmet is riding a motorcycle <EOS>'], ['a group of men playing a game of frisbee <EOS>'], ['a train is pulling into a station with a train <EOS>'], ['a person laying on a couch with a laptop computer <EOS>'], ['a man in a suit and tie holding a red shirt <EOS>'], ['a motorcycle parked in a field next to a field <EOS>'], ['a small wooden bench sitting in front of a tree <EOS>']]


In [9]:
mean, scores = env.cider.compute_score(ground_truth, predictions_)
print(mean, scores[:10])

1.082094715686786 [0.49254889 1.08068883 1.20984291 1.14672698 0.72600969 0.30716041
 0.49112662 0.99049065 1.16946936 0.61320396]


In [10]:
print(mean)
top_idxs = np.argsort(scores)[-5:]
worst_idxs = np.argsort(scores)[:5]

print('Image IDs where we got the highest scores: ', img_ids_[top_idxs])
print('with scores: ', scores[top_idxs])
print('Image IDs where we got the worst scores: ', img_ids_[worst_idxs])
print('with scores: ', scores[worst_idxs])

1.082094715686786
Image IDs where we got the highest scores:  [166696  33924 556101  93276 482242]
with scores:  [3.97740436 3.98211979 4.04241994 4.2829301  4.60229292]
Image IDs where we got the worst scores:  [550980 246398 293452 483013 249219]
with scores:  [0.00325284 0.00369906 0.00371667 0.00703358 0.00827268]


In [11]:
# insert code here to show the pictures...
img = coco.loadImgs(imgId)[0]
I = io.imread('%s/images/%s/%s'%(dataDir,dataType,img['file_name']))
plt.imshow(I)
plt.axis('off')
plt.show()

In [12]:
print('Captions from top scoring:')
print(np.array(predictions)[top_idxs])

print('Captions from top scoring:')
print(np.array(predictions)[worst_idxs])

Captions from top scoring:
[['a baseball player is throwing a ball in a baseball game <EOS>']
 ['a cat is laying on a couch with a remote <EOS>']
 ['a woman is walking down a street while holding a camera <EOS>']
 ['a man in a red shirt and red shirt and red shirt <EOS>']
 ['a man sitting in a chair with a dog on his lap <EOS>']]
Captions from top scoring:
[['two men in suits standing next to a train <EOS>']
 ['a man on a skateboard doing a trick <EOS>']
 ['two birds standing on a beach next to a pile of hay <EOS>']
 ['a bear is walking through a river in a forest <EOS>']
 ['a double decker bus driving down a street <EOS>']]


In [26]:
worst_idxs

array([342, 136, 159, 357, 182])

In [13]:
print('Ground truth captions for top scoring: ')
print(np.array(captions)[top_idxs])

Ground truth captions for top scoring: 
[['baseball team attempting to catch ball and tag player out <EOS>'
  'baseball players throw the ball back and forth to get the runner out <EOS>'
  'a couple of men running around a baseball field <EOS>'
  'several men playing baseball try to keep the runner from scoring <EOS>'
  'a basebakl player runs around the bases as a ball is thrown to a fielder <EOS>']
 ['a kitten on a bed in a blanket and a hand holding an electric toothbrush <EOS>'
  'a hand holding a toothbrush is near a cat on a bed <EOS>'
  'a person holding an electric tooth brush next to a cat sleeping on a bed <EOS>'
  'a person under a blanket with a cat laying next to himher and holding a toothbrush <EOS>'
  'a person is holding his toothbrush next to a sleeping cat <EOS>']
 ['a person standing in front of a bus on a street <EOS>'
  'photographer standing on roadway in front of bus taking picture <EOS>'
  'a person photographing something while standing in front of a bus <EOS>'

In [27]:
print('Ground truth captions for worst scoring: ')
print(np.array(captions)[worst_idxs])

Ground truth captions for worst scoring: 
[['there are several people standing at the end of a train going to rockaway <EOS>'
  'a group of men wearing eye glasses and ties on a train <EOS>'
  'some people standing on a train on railroad tracks <EOS>'
  'four persons in a train going to rockaway <EOS>'
  'a group of men on the back of a train <EOS>']
 ['a man riding a skateboard on top of a metal rail <EOS>'
  'a skate boarder going down a stair railing <EOS>'
  'a young man skateboarding on a metal rail <EOS>'
  'young man riding a skateboard on a rail <EOS>'
  'the boy rides the skateboard on the rail <EOS>']
 ['three birds are looking around while on the ground <EOS>'
  'these three birds are walking along the beach looking for food <EOS>'
  'sea birds walking on wet sand at the beach <EOS>'
  'three small birds standing on a sandy beach <EOS>'
  'three birds stand around on a sandy beach <EOS>']
 ['a brown bear walking across a river near a river <EOS>'
  'a bear drinking water in 

## Evaluate SCST on the same images

In [14]:
agent_scst = Agent(env=env)
agent_scst.actor.load_state_dict(torch.load(
    MODEL_DIR.format(SCST), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

LOADED ACTOR WITH SCST WEIGHTS:  RL-0516-2109-E4


In [15]:
predictions = agent_scst.predict_captions(img_features, mode='greedy', constrain=True)
print(predictions[:10])

predictions_ = dict(zip(img_ids_, predictions))

[['a group of toilets sitting in a bathroom <EOS>'], ['a black and white dog standing in the grass of a black bear <EOS>'], ['a man riding a horse on a beach <EOS>'], ['a man is riding a motorcycle in the street <EOS>'], ['a group of men playing a frisbee in a field <EOS>'], ['a train is sitting at a train station <EOS>'], ['a group of cats sitting on top of a laptop <EOS>'], ['a man in a suit and tie holding a tie <EOS>'], ['a motorcycle parked in the grass of a field <EOS>'], ['a bench sitting on top of a park <EOS>']]


In [16]:
mean, scores = env.cider.compute_score(ground_truth, predictions_)

In [17]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a baseball player throwing a ball on a field <EOS>']
 ['a cat laying on top of a couch with a cell phone <EOS>']
 ['a woman walking down a street with a bus <EOS>']
 ['a baseball player holding a bat on a field <EOS>']
 ['a man sitting on a couch with a dog <EOS>']]
[['a man standing next to a train <EOS>']
 ['a man riding a skateboard on a ramp <EOS>']
 ['a group of birds standing on the beach <EOS>']
 ['a brown bear is standing in the water <EOS>']
 ['a double decker bus is on a city street <EOS>']]


### Notice some improvements on the sentences!

## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.5$)

In [18]:
agent_context = Agent(env=env)
agent_context.actor.load_state_dict(torch.load(
    MODEL_DIR.format(CONTEXT), map_location=None if USE_CUDA else 'cpu'
)['model_state_dict'])
print('LOADED ACTOR WITH SCST WEIGHTS: ', SCST)

LOADED ACTOR WITH SCST WEIGHTS:  RL-0516-2109-E4


In [19]:
predictions = agent_context.predict_captions(img_features, mode='greedy', constrain=True)
print(predictions[:10])

predictions_ = dict(zip(img_ids_, predictions))

[['a bathroom with a toilet and a sink <EOS>'], ['a black and white dog is standing in a field <EOS>'], ['a man is riding a horse on a beach <EOS>'], ['a man is riding a motorcycle in the dirt <EOS>'], ['a group of people playing frisbee in a field <EOS>'], ['a train is sitting on the tracks <EOS>'], ['a group of people sitting on top of a bed <EOS>'], ['a man is wearing a tie and tie <EOS>'], ['a motorcycle is parked in the grass <EOS>'], ['a bench sitting on top of a bench <EOS>']]


In [20]:
mean, scores = env.cider.compute_score(ground_truth, predictions_)

In [21]:
print(np.array(predictions)[top_idxs])
print(np.array(predictions)[worst_idxs])

[['a baseball player is throwing a ball on a field <EOS>']
 ['a cat is laying on top of a couch <EOS>']
 ['a woman is standing in front of a bus <EOS>']
 ['a man is holding a baseball bat on a field <EOS>']
 ['a man is sitting on a couch with a dog <EOS>']]
[['a man is standing in front of a truck <EOS>']
 ['a man is riding a skateboard down a street <EOS>']
 ['a group of birds are standing on the beach <EOS>']
 ['a brown bear is standing in the water <EOS>']
 ['a double decker bus is driving down the street <EOS>']]


## Evaluate CIDEr + Context Reward on the same images
### ($\beta = 0.75$)

# Observations
1. CIDEr-optimized captions lack complete sentence structure. For example, the word 'is' disappears from the predictions, while it is present in the base cross-entropy trained model.
2. CIDEr-optimized captions are much shorter and convey less scene context. For example, the base model predicts `a woman is walking down a street while holding a camera <EOS>`, but the CIDEr- optimized only predicts `'a woman walking down a street with a bus <EOS>'`. This removes the detail that the woman is holding a camera, but also mistakenly says that the woman is walking with a bus. Adding a context reward to CIDEr optimization **need to add comparisons from context-reward predictions**. Another example is `'a man sitting in a chair with a dog on his lap <EOS>'` from the base model, but CIDEr-optimized simplifies this into `'a man sitting on a couch with a dog <EOS>'`, lacking detail. Four out of the five ground truth captions include the detail that the dog is on the lap.

Papers [CITE] directly optimize for CIDEr, and has shown that optimizing for CIDEr also increases other commonly used metrics such as BLEU. However, the observations mentioned show that directly optimizing for CIDEr is not a good end-all be-all method to improve the quality of the captions.  (other papers may have also said this before). 

While the base LSTM model trained with cross-entropy incorporates more detail into its predictions, it is a rigid way of generating text. methods in which the agent is incentivized by the common metrics such as CIDEr and BLEU does not take full advantage of reinforcement learning paradigm, as it still forms a rigid learning goal. Training an agent based on context may be more beneficial in the long run, as it can learn to associate certain contexts with images, instead of relying on word order and word combinations, as is done with direct CIDEr optimization.

Through this simple experiment on adding a context reward term, it may be possible to create a (middleground???). However, further experiments on the addition of the context reward term is necessary. Moreover, incorporating exploration strategies may prove to be useful, as it can take advantage of the more lenient context reward.

In [40]:
np.array(img_ids_)[top_idxs]

array([166696,  33924, 556101,  93276, 482242])

In [41]:
np.array(img_ids_)[worst_idxs]

array([550980, 246398, 293452, 483013, 249219])