In [1]:
import os
from glob import glob
import json
from nltk import word_tokenize, ngrams
import pandas as pd
import pickle
from itertools import chain

In [2]:
class Vocabulary(object):
    """Two-way lookup table between words and consecutive integer ids."""

    def __init__(self):
        # Forward (word -> id) and reverse (id -> word) tables.
        self.word2idx = {}
        self.idx2word = {}
        # Id that the next newly added word will receive.
        self.idx = 0
        # Word whose id is returned by __call__ for out-of-vocabulary lookups.
        self.unknown_token = '<unk>'

    def add_word(self, word):
        """Give ``word`` the next free id; words already present are left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """
        Return the id of ``word``, falling back to the id of the unknown token.

        Raises KeyError when ``word`` is absent and the unknown token itself
        has never been added.
        """
        key = word if word in self.word2idx else self.unknown_token
        return self.word2idx[key]

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)
    

# from Van Miltenburg et al. 2018: Measuring the Diversity of Automatic Image Descriptions
# https://github.com/evanmiltenburg/MeasureDiversity


def chunks(l, n):
    """
    Lazily split ``l`` into consecutive slices of length ``n``.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.

    From: https://stackoverflow.com/a/312464/2899924
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

def type_token_ratio(sentences, n=1000):
    """
    Mean type-token ratio over consecutive, non-overlapping windows of n tokens.

    All sentences are flattened into one token stream, which is cut into
    windows of exactly ``n`` tokens (a shorter trailing window is ignored).
    For each window the ratio (distinct tokens / n) is computed, and the
    average over all windows is returned.

    Prints a warning and returns None when fewer than ``n`` tokens are available.
    """
    tokens = []
    for sentence in sentences:
        tokens.extend(sentence)

    if len(tokens) < n:
        print("Warning: not enough tokens!")
        return None

    window_ttrs = [
        float(len(set(window))) / n
        for window in chunks(tokens, n)
        if len(window) == n  # only full windows are comparable
    ]
    return float(sum(window_ttrs)) / len(window_ttrs)


def ngram_ttr(sentences, n=2, window_size=1000):
    """
    Compute the average n-gram type-token ratio over fixed-size windows.

    All sentences are flattened into one token stream, from which n-grams of
    order ``n`` are extracted (so n-grams may span sentence boundaries). The
    n-gram sequence is cut into consecutive windows of exactly ``window_size``
    n-grams (a shorter trailing window is ignored). For each window the ratio
    (distinct n-grams / window_size) is computed, and the mean is returned.

    Mirrors ``type_token_ratio``: when there are fewer than ``window_size``
    n-grams, a warning is printed and None is returned instead of failing
    with a ZeroDivisionError on the empty list of window scores.
    """
    words = [word for sentence in sentences for word in sentence]

    # A stream of len(words) tokens yields len(words) - n + 1 unpadded n-grams
    # (none if the stream is shorter than n), so the "too little data" case
    # can be detected before building the n-gram list.
    ngram_count = max(len(words) - n + 1, 0)
    if ngram_count < window_size:
        print("Warning: not enough tokens!")
        return None

    all_ngrams = list(ngrams(words, n))
    ttrs = [
        float(len(set(chunk))) / window_size
        for chunk in chunks(all_ngrams, window_size)
        if len(chunk) == window_size  # only full windows are comparable
    ]
    if not ttrs:
        # Safety net in case no full window was produced after all.
        print("Warning: not enough tokens!")
        return None
    return float(sum(ttrs)) / len(ttrs)


def bigram_ttr(sentences):
    """Type-token ratio over word bigrams; delegates to ngram_ttr with n=2."""
    bigram_order = 2
    return ngram_ttr(sentences, n=bigram_order)


def trigram_ttr(sentences):
    """Type-token ratio over word trigrams; delegates to ngram_ttr with n=3."""
    trigram_order = 3
    return ngram_ttr(sentences, n=trigram_order)

def coverage(system_types, train_types):  # adapted to our setting
    """
    Compute vocabulary coverage for a specific system.

    Parameters
    ----------
    system_types : set
        Word types produced by the system.
    train_types : set
        Reference word types that coverage is measured against. The function
        is agnostic to which reference set this is (e.g. a full vocabulary or
        only the set of learnable types).

    Returns
    -------
    dict
        "recalled":   reference types that the system produced,
        "score":      len(recalled) / len(train_types), or 0.0 when
                      ``train_types`` is empty,
        "not_in_val": reference types the system did NOT produce
                      (``train_types - system_types``; the key name is kept
                      as-is for callers that already read it).
    """
    recalled = system_types & train_types
    # An empty reference set would otherwise raise ZeroDivisionError;
    # nothing can be covered, so report 0.0.
    score = len(recalled) / len(train_types) if train_types else 0.0
    return {"recalled": recalled,
            "score": score,
            "not_in_val": train_types - system_types}


In [3]:
def compute_scores(dset, split, input_dir, vocab_dir, ordering=None):
    """
    Compute TTR and vocabulary-coverage scores for every system file of one split.

    Parameters
    ----------
    dset : str
        Dataset name as it appears in file names (the calls in this notebook
        use 'refcoco', 'refcoco+' and 'refcocog'). A '+' is replaced by
        'plus' for the directory and vocabulary file names.
    split : str
        Split name used in the file names (e.g. 'testA').
    input_dir : str
        Directory holding '<dset>/filtered/<dset>_<split>*' JSON files. Each
        file is read as an iterable of entries with a 'caption' field
        (presumably a string; it is passed to ``word_tokenize``).
    vocab_dir : str
        Directory holding '<dset>_vocab.pkl', a pickled object exposing a
        ``word2idx`` mapping.
    ordering : list of str, optional
        Row order of the systems in the result. Defaults to the fixed list of
        systems used in this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per system (indexed by 'system', sorted by ``ordering``) and
        two-level columns (setting, metric) with metrics 'TTR1', 'TTR2' and
        'cov.'.

    Raises
    ------
    FileNotFoundError
        If no expression file matches the dataset/split pattern.
    """
    if ordering is None:
        ordering = ['greedy_l-na_r-na', 'beam_l-na_r-na', 'predfuse_es_l-0-7_r-na','predfuse_es_l-0-5_r-na','predfuse_es_l-0-3_r-na','rsa_l-na_r-0-5','rsa_l-na_r-1-0','rsa_l-na_r-5-0','annsample']

    # Column label for this setting, e.g. 'testA' + '+' for refcoco+.
    setting = f'{split}{dset.replace("refcoco", "")}'
    print(setting)

    dset_dirname = dset.replace("+", "plus")
    input_path = os.path.join(input_dir, f'{dset_dirname}/filtered')
    vocab_path = os.path.join(vocab_dir, f'{dset_dirname}_vocab.pkl')
    pattern = f'{input_path}/{dset}_{split}**'
    files = sorted(glob(pattern))
    if not files:
        # Without this check the failure surfaces later as an obscure
        # KeyError: 'system' on the empty result frame.
        raise FileNotFoundError(f'No expression files match {pattern!r}')

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # vocabulary files from a trusted source.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Special tokens are not real words and must not count towards coverage.
    special_tokens = {'<pad>', '<start>', '<end>', '<unk>'}
    model_vocab = set(vocab.word2idx) - special_tokens

    res = []

    for file in files:

        # Strip suffix, dataset and split from the file name; what remains
        # is the system identifier.
        system = (os.path.basename(file)
                  .replace('_cleaned_filtered.json', '')
                  .replace(f'{dset}_', '')
                  .replace(f'{split}_', ''))

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file, encoding='utf-8') as f:
            system_refexps = [word_tokenize(x['caption']) for x in json.load(f)]

        ttr1 = type_token_ratio(system_refexps)
        ttr2 = bigram_ttr(system_refexps)

        system_types = set(chain(*system_refexps))
        system_coverage = coverage(system_types, model_vocab)

        res.append({
            'system': system,
            'TTR1': ttr1,
            'TTR2': ttr2,
            'cov.': system_coverage['score']
        })

    df = pd.DataFrame(res)

    # Systems absent from `ordering` become NaN in the categorical index
    # below; make that visible instead of losing the names silently.
    unknown_systems = sorted(set(df['system']) - set(ordering))
    if unknown_systems:
        print(f'Warning: systems not in ordering (index will be NaN): {unknown_systems}')

    df['system'] = pd.Categorical(df['system'], categories=ordering)
    df = df.set_index('system').sort_index()

    # Two-level columns (setting, metric) so frames of several settings can
    # be concatenated side by side.
    tuples = [(setting, c) for c in df.columns]
    df.columns = pd.MultiIndex.from_tuples(tuples)

    return df

In [4]:
input_dir = '../data/model_expressions/'
vocab_dir = '../data/model_vocab/'

# (dataset, split) pairs in evaluation order:
# RefCOCO / RefCOCO+ on testA and testB, followed by RefCOCOg on test.
settings = [
    (name, part)
    for name in ['refcoco', 'refcoco+']
    for part in ['testA', 'testB']
]
settings.append(('refcocog', 'test'))

# One score frame per setting, joined side by side on the system index.
dfs = []
for dset, split in settings:
    df = compute_scores(dset, split, input_dir, vocab_dir)
    dfs.append(df)

combined_df = pd.concat(dfs, axis=1)

testA
testB
testA+
testB+
testg


In [5]:
# Rescale the ratio-valued scores to percentages.
# NOTE: combined_df is rebound to a scaled version of itself, so executing
# this cell a second time multiplies the values by 100 again.
combined_df = combined_df.mul(100)

In [6]:
# Apply the colour gradient to every row except the last one (in the output
# shown below that is 'annsample'), and print values with one decimal.
idx = combined_df.index.get_level_values(0)
gradient_rows = pd.IndexSlice[idx[:-1], :]
combined_df.style.format('{:.1f}').background_gradient(subset=gradient_rows)

Unnamed: 0_level_0,testA,testA,testA,testB,testB,testB,testA+,testA+,testA+,testB+,testB+,testB+,testg,testg,testg
Unnamed: 0_level_1,TTR1,TTR2,cov.,TTR1,TTR2,cov.,TTR1,TTR2,cov.,TTR1,TTR2,cov.,TTR1,TTR2,cov.
system,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
greedy_l-na_r-na,7.0,29.2,4.1,10.6,46.5,5.6,8.7,28.2,4.2,16.5,48.1,7.7,16.7,40.0,10.7
beam_l-na_r-na,6.6,27.8,3.6,10.3,48.6,5.2,9.4,32.7,4.1,16.4,56.4,6.9,17.5,42.4,10.4
predfuse_es_l-0-7_r-na,8.3,35.0,4.8,11.6,49.7,6.1,12.5,42.6,5.4,19.3,64.0,7.9,19.8,48.0,13.0
predfuse_es_l-0-5_r-na,11.6,46.4,6.9,13.5,54.4,7.6,16.1,54.4,7.9,22.2,70.3,9.5,22.7,56.4,16.5
predfuse_es_l-0-3_r-na,17.5,60.7,12.6,18.4,62.6,13.0,22.7,65.8,13.2,29.2,80.3,14.8,28.7,71.0,23.6
rsa_l-na_r-0-5,7.8,33.7,4.3,11.6,50.2,5.9,12.1,43.2,5.0,18.5,61.0,7.4,19.8,48.1,12.6
rsa_l-na_r-1-0,7.8,34.6,4.4,11.8,51.6,5.9,13.0,47.0,5.7,19.7,64.7,7.9,20.4,49.8,13.6
rsa_l-na_r-5-0,9.4,39.0,5.7,13.5,55.8,7.2,16.1,54.1,7.6,23.9,74.4,10.6,22.5,57.7,16.3
annsample,24.9,71.7,22.4,27.6,79.7,23.1,31.2,80.6,20.9,39.6,91.2,26.2,34.0,77.8,44.4
