In [1]:
import torch
import json
from eval import evaluate_with_beam
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms

data_folder = '../prepared_data'  # folder with data files saved by create_input_files.py

# Index of the GPU this notebook prefers to run on.
preferred_gpu_index = 1

# Sets device for model and PyTorch tensors.
# torch.cuda.is_available() only says that *some* CUDA device exists; on a
# single-GPU host "cuda:1" is an invalid ordinal and the first .to(device)
# would fail. Fall back to GPU 0 when the preferred index is out of range,
# and to the CPU when CUDA is not available at all.
if torch.cuda.is_available():
    gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
    device = torch.device("cuda", gpu_index)
else:
    device = torch.device("cpu")

print("torch.version=", torch.__version__)
print("device=",device)
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

torch.version= 1.7.0+cu101
device= cuda:1


In [2]:
from checkpoints import data_names, models, word_maps

In [None]:
# Beam widths evaluated for every checkpoint.
beam_widths = range(1, 6)

# Normalization transform. It does not depend on the checkpoint, so it is built
# once here instead of once per loop iteration.
# NOTE(review): `normalize` is not passed to evaluate_with_beam in this cell --
# presumably normalization is applied inside the eval module; confirm before
# removing it.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Evaluate every (dataset name, checkpoint path, word-map path) triple at each
# beam width and print the resulting score object.
for data_name, model, word_map_file in zip(data_names, models, word_maps):

    print("Model: ", model)

    # Load model.
    # map_location=device remaps the stored tensors onto the `device` chosen in
    # the setup cell, so a checkpoint saved on a GPU that does not exist on this
    # host (or on a CPU-only host) can still be deserialized.
    # NOTE: torch.load is pickle-based and the checkpoint stores whole module
    # objects; only load checkpoint files from a trusted source.
    checkpoint = torch.load(model, map_location=device)
    decoder = checkpoint['decoder'].to(device)
    decoder.eval()
    encoder = checkpoint['encoder'].to(device)
    encoder.eval()

    # Load word map (word2ix).
    # The encoding is given explicitly so that any non-ASCII tokens are decoded
    # the same way regardless of the platform's default locale encoding.
    with open(word_map_file, 'r', encoding='utf-8') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # inverse mapping: ix2word
    vocab_size = len(word_map)
    word_map_start = word_map['<start>']
    word_map_end = word_map['<end>']

    # Evaluate with beam search
    for beam_width in beam_widths:
        score = evaluate_with_beam(beam_width, data_name, model, encoder, decoder,
                                   word_map, word_map_start, word_map_end, rev_word_map)
        print(score)


Model:  ../checkpoints/BEST_checkpoint_flickr30kzh_5_cap_per_img_5_min_word_freq_seg_based.pth.tar


EVALUATING AT BEAM SIZE 1: 100%|██████████| 25005/25005 [01:48<00:00, 229.93it/s]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'Bleu_1': 0.24102564102502302, 'Bleu_2': 0.11792084397473217, 'Bleu_3': 0.06603702488989932, 'Bleu_4': 5.511908199900016e-06, 'METEOR': 0.12179722697673562, 'ROUGE_L': 0.22077783065568057, 'CIDEr': 0.1975988025867745, 'SkipThoughtCS': 0.7516707, 'EmbeddingAverageCosineSimilarity': 0.858686, 'EmbeddingAverageCosineSimilairty': 0.858686, 'VectorExtremaCosineSimilarity': 0.65683, 'GreedyMatchingScore': 0.796867, 'bleu1nltk': 0.24102564102564103, 'bleu2nltk': 0.11792084397504535, 'bleu3nltk': 0.06785621390251059, 'bleu4nltk': 1.5910240048951185e-78}


EVALUATING AT BEAM SIZE 2: 100%|██████████| 25005/25005 [02:02<00:00, 204.61it/s]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
EVALUATING AT BEAM SIZE 3:   0%|          | 0/25005 [00:00<?, ?it/s]

{'Bleu_1': 0.22143988426633962, 'Bleu_2': 0.07723343873940514, 'Bleu_3': 0.027154399310069993, 'Bleu_4': 2.9574784941586236e-06, 'METEOR': 0.10024852332527615, 'ROUGE_L': 0.21011531158183058, 'CIDEr': 0.2986306753591256, 'SkipThoughtCS': 0.836198, 'EmbeddingAverageCosineSimilarity': 0.869426, 'EmbeddingAverageCosineSimilairty': 0.869426, 'VectorExtremaCosineSimilarity': 0.67902, 'GreedyMatchingScore': 0.787383, 'bleu1nltk': 0.22143988426800454, 'bleu2nltk': 0.07723343874000155, 'bleu3nltk': 0.028058537752898887, 'bleu4nltk': 7.521373669965086e-79}


EVALUATING AT BEAM SIZE 3:  32%|███▏      | 8091/25005 [00:58<01:23, 202.45it/s]