In [1]:
import numpy as np
import pickle
import sys

In [2]:
sys.path.append('../coco-caption')
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor

# Load Ground Truth

In [3]:
gts = np.load('./dataset/training_groundtruth')
len(gts)

19551

In [4]:
groundtruth = {}
for key, value in gts.iteritems():
    groundtruth[key] = '. '.join(value[1])

In [5]:
groundtruth

{2359296: u'A white bed is in a room. There is a white rug under the bed. There are two small tables next to the bed. There are two small white lamps on the tables',
 2392065: u"We have a bird's eye view of a very small seating area that is in need of decluttering and cleaning. Most prominent in the photo is a sofa covered sloppily by a plaid sheet of various shades of orange, green and cream. A maroon pillow sits in the corner nook of the sofa. Alongside it is a bookshelf and wooden table, all loaded with haphazard items. Also included in the photo is a stereo speaker with a Union Jack tape roll on it, and a desk with an open laptop, nearly empty coffee mug and other items on it. The floor is brick colored tile, and the wall is a light pink shade.",
 2326531: u'A man stands in the open doorway of a tour bus. There is writing on the side of a bus. There are many windows on the bus. The bus is blue and white. The man is holding a phone. the man is wearing sunglasses. The man is wearing 

# Evaluate training_nobias_90.txt

In [6]:
with open('./training_nobias_90.txt', 'rb') as f:
    hypo = f.read().splitlines()

In [7]:
hypothesis = {}
for i in range(0, len(hypo), 2):
    this_id = hypo[i].split(":")[0]
    if (i+1 < len(hypo)):
        hypothesis[int(this_id)] = str(hypo[i+1])

In [8]:
hypothesis

{2359296: 'A white bed is in a room. There is a white wooden table in the center. There are two small tables next to the bed. ',
 2326531: 'A man stands in the doorway of a bus. There is writing on the side of the bus. There are many windows on the bus. The bus is blue and white. The man is wearing a hat. ',
 2359300: 'A large airplane is flying in the sky. There are blue clouds in the sky. ',
 2397526: 'Three pizzas are sitting on a counter top in the middle of a stove. Two of the pizzas have their numbers on them. ',
 2359302: 'A double decker blue bus has a long sleeved front and a red stripe at the top. The bus has the words " <unk> ginger biscuits" on its side in black lettering. The bus has black tires and a large window on the side. The rear wheel on the bus is dark blue. A tall tree trunk is behind the man with green leaves on it. ',
 2408449: 'The boy is holding a bat. He is wearing a <unk> uniform on his face. The kid is wearing a black helmet. The cap is black. Behind the bo

In [9]:
keys_hypothesis = set(hypothesis.keys())
keys_groundtruth = set(groundtruth.keys())
intersection = keys_hypothesis & keys_groundtruth

In [13]:
len(intersection)

14166

In [10]:
evaluation_hypothesis = {}
evaluation_groundtruth = {}
for key in intersection:
    evaluation_hypothesis[key] = hypothesis[key]
    evaluation_groundtruth[key] = groundtruth[key]

In [11]:
scorers = [
        (Bleu(4),["Bleu_1","Bleu_2","Bleu_3","Bleu_4"])
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'Bleu_1:\t',final_scores['Bleu_1']  
print 'Bleu_2:\t',final_scores['Bleu_2']  
print 'Bleu_3:\t',final_scores['Bleu_3']  
print 'Bleu_4:\t',final_scores['Bleu_4']

{'reflen': 14064, 'guess': [571701, 557637, 543573, 529509], 'testlen': 571701, 'correct': [20177, 0, 0, 0]}
ratio: 40.6499573379
Bleu_1:	0.0352929240984
Bleu_2:	7.95551007395e-12
Bleu_3:	4.88306750123e-15
Bleu_4:	1.21773057163e-16


In [12]:
scorers = [
        (Rouge(),"ROUGE_L"),
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'ROUGE_L:',final_scores['ROUGE_L']  

ROUGE_L: 0.06944817817832258


In [14]:
scorers = [
        (Cider(),"CIDEr")
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
             
print 'CIDEr:\t',final_scores['CIDEr']

CIDEr:	5.534504230405034e-07


# Evaluate validation_nobias_90.txt

In [6]:
with open('./validation_nobias_90.txt', 'rb') as f:
    hypo = f.read().splitlines()

In [7]:
hypothesis = {}
for i in range(0, len(hypo), 2):
    this_id = hypo[i].split(":")[0]
    if (i+1 < len(hypo)):
        hypothesis[int(this_id)] = str(hypo[i+1])

In [8]:
keys_hypothesis = set(hypothesis.keys())
keys_groundtruth = set(groundtruth.keys())
intersection = keys_hypothesis & keys_groundtruth

In [9]:
evaluation_hypothesis = {}
evaluation_groundtruth = {}
for key in intersection:
    evaluation_hypothesis[key] = hypothesis[key]
    evaluation_groundtruth[key] = groundtruth[key]

In [10]:
scorers = [
        (Bleu(4),["Bleu_1","Bleu_2","Bleu_3","Bleu_4"])
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'Bleu_1:\t',final_scores['Bleu_1']  
print 'Bleu_2:\t',final_scores['Bleu_2']  
print 'Bleu_3:\t',final_scores['Bleu_3']  
print 'Bleu_4:\t',final_scores['Bleu_4']

{'reflen': 2419, 'guess': [99442, 97023, 94604, 92185], 'testlen': 99442, 'correct': [3084, 0, 0, 0]}
ratio: 41.1087226126
Bleu_1:	0.0310130528348
Bleu_2:	1.7878657716e-11
Bleu_3:	1.50056025833e-14
Bleu_4:	4.37547306916e-16


In [11]:
scorers = [
        (Rouge(),"ROUGE_L"),
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'ROUGE_L:',final_scores['ROUGE_L'] 

ROUGE_L: 0.060132185119870805


In [12]:
scorers = [
        (Cider(),"CIDEr")
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
             
print 'CIDEr:\t',final_scores['CIDEr']

CIDEr:	4.454507872414481e-08


# Evaluate training_normalized_bias_130.txt

In [22]:
with open('./training_normalized_bias_130.txt', 'rb') as f:
    hypo = f.read().splitlines()

In [23]:
hypothesis = {}
for i in range(0, len(hypo), 2):
    this_id = hypo[i].split(":")[0]
    if (i+1 < len(hypo)):
        hypothesis[int(this_id)] = str(hypo[i+1])

In [24]:
keys_hypothesis = set(hypothesis.keys())
keys_groundtruth = set(groundtruth.keys())
intersection = keys_hypothesis & keys_groundtruth

In [25]:
evaluation_hypothesis = {}
evaluation_groundtruth = {}
for key in intersection:
    evaluation_hypothesis[key] = hypothesis[key]
    evaluation_groundtruth[key] = groundtruth[key]

In [26]:
scorers = [
        (Bleu(4),["Bleu_1","Bleu_2","Bleu_3","Bleu_4"])
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'Bleu_1:\t',final_scores['Bleu_1']  
print 'Bleu_2:\t',final_scores['Bleu_2']  
print 'Bleu_3:\t',final_scores['Bleu_3']  
print 'Bleu_4:\t',final_scores['Bleu_4']

{'reflen': 14064, 'guess': [656045, 641981, 627917, 613853], 'testlen': 656045, 'correct': [20603, 0, 0, 0]}
ratio: 46.6471131968
Bleu_1:	0.0314048578985
Bleu_2:	6.99418843682e-12
Bleu_3:	4.27094657455e-15
Bleu_4:	1.06139492924e-16


In [27]:
scorers = [
        (Rouge(),"ROUGE_L"),
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'ROUGE_L:',final_scores['ROUGE_L'] 

  if(string[i-1] == sub[j-1]):


ROUGE_L: 0.11433021046825093


In [28]:
scorers = [
        (Cider(),"CIDEr")
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
             
print 'CIDEr:\t',final_scores['CIDEr']

CIDEr:	2.8717619331341946e-07


# Evaluate validation_normalized_bias_130.txt

In [29]:
with open('./validation_normalized_bias_130.txt', 'rb') as f:
    hypo = f.read().splitlines()

In [30]:
hypothesis = {}
for i in range(0, len(hypo), 2):
    this_id = hypo[i].split(":")[0]
    if (i+1 < len(hypo)):
        hypothesis[int(this_id)] = str(hypo[i+1])

In [31]:
keys_hypothesis = set(hypothesis.keys())
keys_groundtruth = set(groundtruth.keys())
intersection = keys_hypothesis & keys_groundtruth

In [32]:
evaluation_hypothesis = {}
evaluation_groundtruth = {}
for key in intersection:
    evaluation_hypothesis[key] = hypothesis[key]
    evaluation_groundtruth[key] = groundtruth[key]

In [33]:
scorers = [
        (Bleu(4),["Bleu_1","Bleu_2","Bleu_3","Bleu_4"])
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'Bleu_1:\t',final_scores['Bleu_1']  
print 'Bleu_2:\t',final_scores['Bleu_2']  
print 'Bleu_3:\t',final_scores['Bleu_3']  
print 'Bleu_4:\t',final_scores['Bleu_4']

{'reflen': 2419, 'guess': [114927, 112508, 110089, 107670], 'testlen': 114927, 'correct': [3132, 0, 0, 0]}
ratio: 47.5101281521
Bleu_1:	0.0272520817562
Bleu_2:	1.55635311527e-11
Bleu_3:	1.30064103143e-14
Bleu_4:	3.7808931983e-16


In [34]:
scorers = [
        (Rouge(),"ROUGE_L"),
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
            
print 'ROUGE_L:',final_scores['ROUGE_L'] 

ROUGE_L: 0.05282495051776447


In [35]:
scorers = [
        (Cider(),"CIDEr")
    ]
final_scores = {}
for scorer,method in scorers:
    score,scores = scorer.compute_score(evaluation_groundtruth,evaluation_hypothesis)
    if type(score)==list:
        for m,s in zip(method,score):
            final_scores[m] = s
    else:
        final_scores[method] = score
             
print 'CIDEr:\t',final_scores['CIDEr']

CIDEr:	4.344350737346062e-09
