# On the difficulty of a distributional semantics of speech
Grzegorz Chrupała, Ákos Kádár, Lieke Gelderloos, Afra Alishahi

# Table 1 (results on Synthetically Spoken COCO)

In [1]:
from vg.vendrov_provider import Provider
# Data provider shared by the cells below: dataset name 'coco', data root '..'
# (relative to the notebook's working directory), audio_kind='mfcc'.
# NOTE(review): presumably 'mfcc' selects precomputed MFCC features as the audio
# representation -- confirm against vg.vendrov_provider.
prov = Provider('coco', root='..', audio_kind='mfcc')

In [3]:
import vg.scorer as S
def audio(x):
    """Return the 'audio' entry of a sentence record."""
    return x['audio']


# Scorer configuration: validation split, `audio` as the tokenize callable,
# and batches of 32.
config = {'split': 'val', 'tokenize': audio, 'batch_size': 32}
scorer = S.Scorer(prov, config)

In [4]:
import numpy as np
import torch
import torch.autograd as autograd

# Fix every random number generator this notebook can touch so that the table
# is reproducible.  Seeding only the CUDA generators is not enough: RandNet
# below fills a CPU FloatTensor with uniform_(), which draws from the CPU
# generator, so the "Chance" row would otherwise not be controlled by `seed`.
seed = 666
np.random.seed(seed)
torch.manual_seed(seed)
# Silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed)
class MeanNet:
    """Parameter-free baseline that encodes an input by averaging it.

    Exposes the `training` attribute and the `eval()` / `predict()` methods
    used when it is passed to the scorer in place of a trained network.
    """

    def __init__(self):
        # Never trained, so the flag is fixed to False.
        self.training = False

    def eval(self):
        """Do nothing; there is no training-specific state to switch off."""
        return None

    def predict(self, x):
        """Return the mean of `x` taken over dimension 1."""
        pooled = x.mean(dim=1)
        return pooled
    
class RandNet:
    """Chance baseline that ignores the input content and emits random vectors.

    For an input `x`, `predict` returns one vector of `features` values per
    item along dimension 0 of `x`, each drawn uniformly between -1 and 1.
    """

    def __init__(self, features=1024):
        self.features = features
        # Never trained, so the flag is fixed to False.
        self.training = False

    def eval(self):
        """Do nothing; there is no training-specific state to switch off."""
        return None

    def predict(self, x):
        """Return a (x.size(0), features) Variable of uniform noise in [-1, 1)."""
        n_items = x.size(0)
        noise = torch.FloatTensor(n_items, self.features)
        noise.uniform_(-1, 1)
        return autograd.Variable(noise)
    

In [5]:
# Table 1: evaluate the two untrained baselines and the trained models with
# the shared `scorer`, printing one whitespace-separated row per model.
ROW_FORMAT = "{} {:.2} {} {:.2}"


def report(name, net):
    """Score `net` and print its table row.

    The row holds the model name followed by the 'recall@10' and 'medr'
    entries of scorer.retrieval_para(net) and the 'img_rep' entry of
    scorer.rsa_image(net), matching the header columns
    "Model R@10 Medianr RSA_img".
    """
    ret = scorer.retrieval_para(net)
    rsa = scorer.rsa_image(net)
    print(ROW_FORMAT.format(name, ret['recall@10'], ret['medr'], rsa['img_rep']))


print("Model R@10 Medianr RSA_img")
for name, net in [("MFCC", MeanNet()), ("Chance", RandNet())]:
    report(name, net)

# Trained checkpoints, listed in the order the rows should be printed.  An
# explicit list keeps the row order stable on every Python version; iterating
# a plain dict does not guarantee insertion order before Python 3.7.
# NOTE: the checkpoint "../experiments/coco-audio2vecc/model.1.pkl" was found
# to misbehave and is intentionally not evaluated.
net_paths = [
    ("SegMatch", "../experiments/coco-audiosemi-gru-er-15-b/model.r.e12.zip"),
    ("audio2vecc", "../experiments/coco-audio2vec-gru-a/model.r.e4.zip"),
    ("audio2vecu", "../experiments/coco-audio2vecu-B/model.r.e5.zip"),
]
# Kept as a name -> path mapping for anything that looks models up by name.
nets = dict(net_paths)

for name, path in net_paths:
    net = S.load(path).cuda()
    report(name, net)

Model R@10 Medianr RSA_img
MFCC 0.0058 1413.5 0.02
Chance 0.00031 3955.0 -7.3e-05
SegMatch 0.1 37.0 0.5
audio2vecu 0.047 105.0 -0.004
audio2vecc 0.016 647.0 -0.005


## Visually grounded

In [6]:
import imaginet.defn.audiovis_rhn as audiovis
from vg.vendrov_provider import Provider
# Same provider arguments as at the top of the notebook: dataset name 'coco',
# data root '..', audio_kind='mfcc'.  Re-created here so that this section can
# be run without the cells above.
# NOTE(review): presumably this section was executed in a separate session
# (the execution counts are not contiguous) -- confirm.
prov = Provider('coco', root='..', audio_kind='mfcc')



In [10]:
import os

import numpy
import imaginet.task

# Location of the pretrained visually grounded speech model.  The default is
# the original author's absolute path; set the VGS_MODEL_PATH environment
# variable to point at a local copy instead of editing this cell.
model_path = os.environ.get(
    "VGS_MODEL_PATH",
    "/roaming/gchrupal/visually-grounded-speech/models/coco-speech.zip")
model = imaginet.task.load(model_path)

# Collect the 'audio' entry of every validation sentence and encode them all
# with the loaded model.
mfcc = numpy.array([s['audio'] for s in prov.iterSentences(split='val')])
pred = audiovis.encode_sentences(model, mfcc)

In [11]:
import vg.scorer as S
def audio(x):
    """Return the 'audio' entry of a sentence record."""
    return x['audio']


# Scorer configuration: validation split, `audio` as the tokenize callable,
# and batches of 32.
config = {'split': 'val', 'tokenize': audio, 'batch_size': 32}
scorer = S.Scorer(prov, config)
# HACKY: attach the precomputed encodings directly to the scorer object.
# NOTE(review): presumably retrieval_para() / rsa_image() fall back to
# `scorer.pred` when called without a network -- confirm in vg.scorer.
scorer.pred = pred

In [14]:
# Print the VGS row, with the same header as the table above.  Both scorer
# methods are called without a network argument here.
table_header = "Model R@10 Medianr RSA_img"
vgs_row_format = "{} {:.2} {} {:.2}"

print(table_header)
ret = scorer.retrieval_para()
rsa = scorer.rsa_image()
vgs_row = vgs_row_format.format("VGS", ret['recall@10'], ret['medr'], rsa['img_rep'])
print(vgs_row)

Model R@10 Medianr RSA_img
VGS 0.28 6.0 0.41
