# Grounding analysis

In [1]:
import sys 
import pandas as pd
import numpy as np
import json
from os.path import join
import pickle
from scipy.stats import pearsonr, ttest_ind, spearmanr

OUTPUT_PATH = '/Users/anna/Documents/Code/Ads/data/outputs'
sys.path.append(OUTPUT_PATH) 
from bert_utils import SentenceEmbeddings



## Textual grounding

### Original test set

Textual overlap

In [2]:
from nltk.stem import WordNetLemmatizer 
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

In [3]:
with open(join(OUTPUT_PATH,'test_right.json'), 'r') as my_right_file:
    right_data=my_right_file.read()
right_dict = json.loads(right_data)


with open(join(OUTPUT_PATH,'test_wrong.json'), 'r') as my_wrong_file:
    wrong_data=my_wrong_file.read()
wrong_dict = json.loads(wrong_data)
ocr_df = pd.read_csv(join(OUTPUT_PATH, 'text_sim.csv'))

In [4]:
print('Length of the test set:', len(right_dict))
print('Test set samples with OCR-extracred text:', ocr_df.shape[0])
print(ocr_df.shape)
print('Number of null texts:', ocr_df.text.isnull().sum())
ocr_df.head()

Length of the test set: 12805
Test set samples with OCR-extracred text: 12304
(12304, 5)
Number of null texts: 0


Unnamed: 0,image_file,all_answers,wrong_answers,right_answers_update,text
0,174225.png,"['I should buy Versace Becuase its bright', 'I...",['I should buy Versace Becuase its bright'\n '...,"['I should watch the amount I drink, because t...",BEER BEE AGE bac OUT DEPEN IM iVer disease esi...
1,88925.jpg,"[""I should buy from the Farmer's Market Becaus...","[""I should buy from the Farmer's Market Becaus...",['I should have this car because it is nice an...,L INC OL N CONTINENTA L
2,173505.png,['I should use mySIM Plans because they are mo...,['I should use Boost Mobile Because it is a c...,['I should use mySIM Plans because they are mo...,mySIM Plans and Phone Deal:s Enjoy even greate...
3,173522.png,['I should go grocery shopping at Meijer Becau...,['I should go grocery shopping at Meijer Becau...,['I should buy Dior cosmetics because they wil...,ior DIORSHOW ICONIC OVERCURL
4,92004.jpg,"[""I should buy Hershey's kisses Because they'r...","[""I should buy Hershey's kisses Because they'r...",['I should buy this Manavox tv Because I like ...,NEW! COMPUTER COLOR FROM MAGNAVOX TOUCH TUNE E...


In [5]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

def preprocess_text(text, lemmatizer=lemmatizer, tokenizer=tokenizer, stopwords=stop_words):

    tokens = tokenizer.tokenize(text)
    lemma_text = [lemmatizer.lemmatize(t.lower()) for t in tokens if t not in stop_words]

    return lemma_text

def compute_overlap(ocr_set, ar):
    intersection = 0
    if len(ar) == 0:
        return 0
    for el in ar:
        if el in ocr_set:
            intersection += 1
    
    return intersection/len(ar)

def compute_scores(dataframe, ar_dict):
    scores = []
    for i in range(len(dataframe)):
        im = dataframe.image_file[i]
        ocr_set = set(preprocess_text(dataframe.text[i]))
        for ar in ar_dict[im]:
            scores.append(compute_overlap(ocr_set, preprocess_text(ar)))
    
    return np.array(scores)


In [6]:
scores_right = compute_scores(ocr_df, right_dict)
scores_wrong = compute_scores(ocr_df, wrong_dict)
print(scores_right.mean())
print(scores_wrong.mean())
ttest_ind(scores_right, scores_wrong)

0.21112030031086035
0.02799327879886783


TtestResult(statistic=295.64930142873544, pvalue=0.0, df=184558.0)

In [7]:
text_overlap_orig = scores_right

In [7]:
# clip scores

clip_outs_right = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_test_right.pkl'), "rb"))
clip_outs_wrong = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_test_wrong.pkl'), "rb"))

def get_clip_scores(dataframe, clip_outputs):
    im_paths = dataframe.image_file.values.tolist()
    clip_sim = np.array([])
    for path in im_paths:
        logits = clip_outputs.out_dict[path]['logits']
        clip_sim = np.concatenate((clip_sim, logits), axis=0)
    
    return clip_sim

clip_scores_right = get_clip_scores(ocr_df, clip_outs_right)
clip_scores_wrong = get_clip_scores(ocr_df, clip_outs_wrong)

print(clip_scores_right.mean())
print(clip_scores_wrong.mean())
ttest_ind(clip_scores_right, clip_scores_wrong)


23.779806262659946
12.658216666904


TtestResult(statistic=444.2081415327576, pvalue=0.0, df=184558.0)

In [8]:
print('Overall correlation:', spearmanr(np.concatenate((scores_wrong, scores_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

Overall correlation: SignificanceResult(statistic=0.45245768573873035, pvalue=0.0)


Sentence-level similarity

In [8]:
ocr_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_ocr_text.pkl'), "rb"))
right_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_corr_ar.pkl'), "rb"))
wrong_outs = pickle.load(open(join(OUTPUT_PATH,'mpnet', 'all-mpnet-base-v2_wrong_ar.pkl'), "rb"))

print(ocr_outs.embeddings.shape)
print(wrong_outs.embeddings.shape)
print(right_outs.embeddings.shape)

(12085, 768)
(37548, 768)
(38734, 768)


In [9]:
def compute_sim(ocr_t, image_path, ar_dict, ocr_outputs, ar_outputs):
    ar_list = ar_dict[image_path]
    n = len(ar_list)
    # print(n)
    res = np.zeros(n)
    
    for i in range(n):
        res[i] = pearsonr(ocr_outputs(ocr_t), ar_outputs(ar_list[i]))[0]
    
    # print(res)

    return res

def get_sim_scores(dataframe, ar_dict, ocr_outputs, ar_outputs):
    sim_scores = np.zeros(0)
    for i in range(len(dataframe)):
        text = dataframe.text[i]
        path = dataframe.image_file[i]
        sim = compute_sim(ocr_t=text, image_path=path, ar_dict=ar_dict, ocr_outputs=ocr_outputs, ar_outputs=ar_outputs)
        sim_scores = np.concatenate((sim_scores, sim), axis=0)

    return sim_scores


In [10]:
scores_right = get_sim_scores(ocr_df, right_dict, ocr_outs, right_outs)
scores_wrong = get_sim_scores(ocr_df, wrong_dict, ocr_outs, wrong_outs)
print(scores_right.mean())
print(scores_wrong.mean())
ttest_ind(scores_right, scores_wrong)

0.4056594954126213
0.1292979560786269


TtestResult(statistic=343.60494711862356, pvalue=0.0, df=184558.0)

In [11]:
text_sim_orig = scores_right

In [20]:
print('Overall correlation:', spearmanr(np.concatenate((scores_wrong, scores_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

Overall correlation: SignificanceResult(statistic=0.6109834865289566, pvalue=0.0)


### Our dataset

In [12]:
from nltk.stem import WordNetLemmatizer 
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
df = pd.read_csv(join(OUTPUT_PATH, 'dist_w_ocr.csv'))
df = df[~df.text.isnull()].reset_index()
print(df.shape)
print(df.columns)

(294, 8)
Index(['index', 'image_path', 'distractor_1', 'distractor_2', 'flag', 'ar',
       'annotator_id', 'text'],
      dtype='object')


CLIP scores

In [27]:
print(df.shape)
clip_our_dataset = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_our_dataset.pkl'), "rb"))
clip_scores_right = np.array([clip_our_dataset.out_dict[df.image_path[i]]['logits'][0] for i in range(len(df))])
clip_scores_wrong = np.array([clip_our_dataset.out_dict[df.image_path[i]]['logits'][1:] for i in range(len(df))]).flatten()
print(clip_scores_right.mean())
print(clip_scores_wrong.mean())
ttest_ind(clip_scores_right, clip_scores_wrong)

(294, 8)
24.86613
24.420742


TtestResult(statistic=1.2235935691912778, pvalue=0.22143311080394262, df=880.0)

Word overlap

In [13]:
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

def preprocess_text(lemmatizer, tokenizer, text):

    tokens = tokenizer.tokenize(text)
    lemma_text = [lemmatizer.lemmatize(t.lower()) for t in tokens if t not in stop_words]

    return lemma_text

def compute_overlap(ocr_set, ar):
    intersection = 0
    for el in ar:
        if el in ocr_set:
            intersection += 1
    
    return intersection/len(ar)

def compute_scores_right(dataframe):
    scores = []
    for i in range(len(dataframe)):
        
        ocr_set = set(preprocess_text(lemmatizer, tokenizer, dataframe.text[i]))
        scores.append(compute_overlap(ocr_set, preprocess_text(lemmatizer, tokenizer, dataframe.ar[i])))
    
    return np.array(scores)

def compute_scores_wrong(dataframe):
    scores = []
    for i in range(len(dataframe)):
        
        ocr_set = set(preprocess_text(lemmatizer, tokenizer, dataframe.text[i]))
        scores.append(compute_overlap(ocr_set, preprocess_text(lemmatizer, tokenizer, dataframe.distractor_1[i])))
        scores.append(compute_overlap(ocr_set, preprocess_text(lemmatizer, tokenizer, dataframe.distractor_2[i])))
    
    return np.array(scores)


In [14]:
scores_right = compute_scores_right(df)
scores_wrong = compute_scores_wrong(df)
print(scores_right.mean(), scores_right.shape)
print(scores_wrong.mean(), scores_wrong.shape)
ttest_ind(scores_right, scores_wrong, alternative='two-sided')

0.2706212686744373 (294,)
0.3139828331154862 (588,)


TtestResult(statistic=-2.835571609692304, pvalue=0.0046792512242782944, df=880.0)

In [15]:
text_overl_trade = scores_right

In [13]:
print('Overall correlation:', spearmanr(np.concatenate((scores_wrong, scores_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

Overall correlation: SignificanceResult(statistic=0.2655390641506729, pvalue=1.0632978378600645e-15)


Semantic similarity with sentence embeddings

In [16]:
df = pd.read_csv(join(OUTPUT_PATH, 'dist_w_ocr.csv'))
df = df[~df.text.isnull()].reset_index()
print(df.shape)
ocr_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_ocr_text_our_distractors.pkl'), "rb"))
right_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_corr_ar_our_distractors.pkl'), "rb"))
wrong_outs = pickle.load(open(join(OUTPUT_PATH,'mpnet', 'all-mpnet-base-v2_wrong_ar_our_distractors.pkl'), "rb"))


(294, 8)


In [17]:
def compute_sim(dataframe, ocr_outputs, wrong_outputs):
    ret = np.array([])
    for i in range(len(dataframe)):
        new_arr = np.zeros(2)
        new_arr[0] = pearsonr(wrong_outputs(dataframe.distractor_1[i]), ocr_outputs(dataframe.text[i]))[0]
        new_arr[1] = pearsonr(wrong_outputs(dataframe.distractor_2[i]), ocr_outputs(dataframe.text[i]))[0]
        ret = np.concatenate((ret, new_arr))

    return ret


scores_right = df.apply(lambda x: pearsonr(ocr_outs(x.text), right_outs(x.ar))[0], axis=1)
scores_wrong = compute_sim(df, ocr_outs, wrong_outs)
print(scores_right.mean())
print(scores_wrong.mean())
ttest_ind(scores_right, scores_wrong)

0.44220495456887843
0.42110715510001195


TtestResult(statistic=1.7155641215005741, pvalue=0.08659384747839889, df=880.0)

In [18]:
text_sim_trade = scores_right

In [62]:
print('Overall correlation:', spearmanr(np.concatenate((scores_wrong, scores_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

Overall correlation: SignificanceResult(statistic=0.40992078846339486, pvalue=4.578079555696565e-37)


## Image-based grounding

### Original dataset

Simple overlap

In [19]:
import nltk

with open(join(OUTPUT_PATH,'test_right.json'), 'r') as my_right_file:
    right_data=my_right_file.read()
right_dict = json.loads(right_data)


with open(join(OUTPUT_PATH,'test_wrong.json'), 'r') as my_wrong_file:
    wrong_data=my_wrong_file.read()
wrong_dict = json.loads(wrong_data)
ocr_df = pd.read_csv(join(OUTPUT_PATH, 'text_sim.csv'))

obj_outputs = pickle.load(open(join(OUTPUT_PATH, 'det2','short_mask_rcnn_R_50_FPN_3x.pkl'), "rb"))

print(len(obj_outputs['objects']))

l = []
red_dict = {}
for k in obj_outputs['objects']:
    if len(obj_outputs['objects'][k]) > 0:
        red_dict[k] = obj_outputs['objects'][k]
        l.append(len(obj_outputs['objects'][k]))
print(f'Images were detected for {len(red_dict)} objects')
print(f'An average of {np.round(np.array(l).mean(),2)} ({np.round(np.array(l).std(),2)}) objects was detected per image')

12805
Images were detected for 11351 objects
An average of 3.74 (3.85) objects was detected per image


In [20]:
classes = obj_outputs['classes']
cl_arr = np.array([nltk.pos_tag([t])[0][1] for t in classes])
np.unique(cl_arr)

array(['JJ', 'NN', 'NNS', 'RB'], dtype='<U3')

In [21]:
def clean_text(sentence):
	tokenizer = RegexpTokenizer(r'\w+')
	lemmatizer = WordNetLemmatizer()
	pos_tags = nltk.pos_tag(tokenizer.tokenize(sentence))
	ret_s = [lemmatizer.lemmatize(el[0].lower()) for el in pos_tags if el[1] in ['NN', 'NNP', 'NNS']]
		
	return ret_s

def get_classes_list(class_names, class_numbers):
    ret = [class_names[i] for i in class_numbers]
    return ret


classes = obj_outputs['classes']
right_ar = []
right_obj = []
wrong_ar = []
wrong_obj = []
for k in red_dict:
    for ar in right_dict[k]:
        right_ar.append(clean_text(ar))
        right_obj.append(get_classes_list(classes, red_dict[k]))
    for ar in wrong_dict[k]:
        wrong_ar.append(clean_text(ar))
        wrong_obj.append(get_classes_list(classes, red_dict[k]))

print(len(right_ar))
print(len(right_obj))
print(len(wrong_ar))
print(len(wrong_obj))

35617
35617
134648
134648


In [22]:
# original setup
classes = obj_outputs['classes']
right_ar = []
right_obj = []
wrong_ar = []
wrong_obj = []
for k in red_dict:
    for ar in right_dict[k]:
        right_ar.append(clean_text(ar))
        right_obj.append(get_classes_list(classes, red_dict[k]))
    for ar in wrong_dict[k]:
        wrong_ar.append(clean_text(ar))
        wrong_obj.append(get_classes_list(classes, red_dict[k]))

print(len(right_ar))
print(len(right_obj))
print(len(wrong_ar))
print(len(wrong_obj))

35617
35617
134648
134648


In [23]:
def compute_overlap(obj_set, ar_list):
    
    if len(ar_list) == 0:
        return 0
    intersection = 0
    for el in ar_list:
        if el in obj_set:
            intersection += 1
    
    return intersection/len(ar_list)

overl_right = np.array([compute_overlap(set(right_obj[i]), right_ar[i] ) for i in range(len(right_ar))])
overl_wrong = np.array([compute_overlap(set( wrong_obj[i]), wrong_ar[i]) for i in range(len(wrong_ar))])
print(overl_right.mean())
print(overl_wrong.mean())
ttest_ind(overl_right, overl_wrong)


0.027336710966017193
0.008844132064306147


TtestResult(statistic=36.09690184426529, pvalue=3.041267010948811e-284, df=170263.0)

In [36]:
def get_clip_scores(ar_dict, clip_outputs):
    im_paths = list(ar_dict.keys())
    clip_sim = np.array([])
    for path in im_paths:
        logits = clip_outputs.out_dict[path]['logits']
        clip_sim = np.concatenate((clip_sim, logits), axis=0)
    
    return clip_sim
        
red_right_dict = {k: right_dict[k] for k in right_dict if k in red_dict}
red_wrong_dict = {k: wrong_dict[k] for k in wrong_dict if k in red_dict}
clip_outs_right = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_test_right.pkl'), "rb"))
clip_outs_wrong = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_test_wrong.pkl'), "rb"))
clip_scores_right = get_clip_scores(red_right_dict, clip_outs_right)
clip_scores_wrong = get_clip_scores(red_wrong_dict, clip_outs_wrong)
print(clip_scores_right.mean())
print(clip_scores_wrong.mean())
print(ttest_ind(clip_scores_right, clip_scores_wrong))

print('Overall correlation:', spearmanr(np.concatenate((overl_wrong, overl_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

23.720198524166612
12.737644326339097
TtestResult(statistic=422.74555826261627, pvalue=0.0, df=170263.0)
Overall correlation: SignificanceResult(statistic=0.1377801591338042, pvalue=0.0)


In [24]:
obj_mention_orig = overl_right

Captions

In [25]:
df = pd.read_csv(join(OUTPUT_PATH, 'text_sim.csv'))
captions = pickle.load(open(join(OUTPUT_PATH, 'blip2', 'blip2-opt-2.7b_blip_captions_wrong.pkl'), "rb"))
capt_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_caption_embeddings.pkl'), "rb"))
right_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_corr_ar.pkl'), "rb"))
wrong_outs = pickle.load(open(join(OUTPUT_PATH,'mpnet', 'all-mpnet-base-v2_wrong_ar.pkl'), "rb"))
clip_outs_right = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_test_right.pkl'), "rb"))
clip_outs_wrong = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_test_wrong.pkl'), "rb"))

In [27]:
def compute_sim(cap_t, image_path, ar_dict, cap_outputs, ar_outputs):
    ar_list = ar_dict[image_path]
    n = len(ar_list)
    # print(n)
    res = np.zeros(n)
    
    for i in range(n):
        res[i] = pearsonr(cap_outputs(cap_t), ar_outputs(ar_list[i]))[0]
    
    # print(res)

    return res

def get_sim_scores(key_list, captions, ar_dict, cap_outputs, ar_outputs):
    sim_scores = np.zeros(0)
    for i in range(len(key_list)):
     
        path = key_list[i]
        text = captions['outputs'][path]['caption']
        sim = compute_sim(cap_t=text, image_path=path, ar_dict=ar_dict, cap_outputs=cap_outputs, ar_outputs=ar_outputs)
        sim_scores = np.concatenate((sim_scores, sim), axis=0)

    return sim_scores


In [28]:
scores_right = get_sim_scores(list(right_dict.keys()), captions, right_dict, capt_outs, right_outs)
scores_wrong = get_sim_scores(list(right_dict.keys()), captions, wrong_dict, capt_outs, wrong_outs)
print(scores_right.mean())
print(scores_wrong.mean())
ttest_ind(scores_right, scores_wrong)


0.3159710148179032
0.10862949894945366


TtestResult(statistic=276.47171918205925, pvalue=0.0, df=192073.0)

In [29]:
cap_sim_orig = scores_right

In [40]:
def get_clip_scores(key_list, clip_outputs):
    clip_sim = np.array([])
    for path in key_list:
        logits = clip_outputs.out_dict[path]['logits']
        clip_sim = np.concatenate((clip_sim, logits), axis=0)
    
    return clip_sim

clip_scores_right = get_clip_scores(list(right_dict.keys()), clip_outs_right)
clip_scores_wrong = get_clip_scores(list(wrong_dict.keys()), clip_outs_wrong)

print(clip_scores_right.mean())
print(clip_scores_wrong.mean())
print(ttest_ind(clip_scores_right, clip_scores_wrong))

print('Overall correlation:', spearmanr(np.concatenate((scores_wrong, scores_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

23.72006608930263
12.680320747900193
TtestResult(statistic=450.6241275160643, pvalue=0.0, df=192073.0)
Overall correlation: SignificanceResult(statistic=0.5276868498200121, pvalue=0.0)


### Our dataset

CLIP scores

In [30]:
df = pd.read_csv(join(OUTPUT_PATH, 'dist_w_ocr.csv'))
print(df.shape)
clip_our_dataset = pickle.load(open(join(OUTPUT_PATH, 'clip','clip-vit-large-patch14-336_clip_score_our_dataset.pkl'), "rb"))
clip_scores_right = np.array([clip_our_dataset.out_dict[df.image_path[i]]['logits'][0] for i in range(len(df))])
clip_scores_wrong = np.array([clip_our_dataset.out_dict[df.image_path[i]]['logits'][1:] for i in range(len(df))]).flatten()
print(clip_scores_right.mean())
print(clip_scores_wrong.mean())
print(ttest_ind(clip_scores_right, clip_scores_wrong))


(300, 7)
24.842833
24.385693
TtestResult(statistic=1.2651321675697134, pvalue=0.20615213969344062, df=898.0)


Lemma overlap

In [31]:
# our dataset
obj_outputs = pickle.load(open(join(OUTPUT_PATH, 'det2','ours_slurm_mask_rcnn_R_50_FPN_3x.pkl'), "rb"))

print(len(obj_outputs))

l = []
red_dict = {}
for k in obj_outputs:
    if len(obj_outputs[k]) > 0:
        red_dict[k] = obj_outputs[k]['pred_classes']
        l.append(len(obj_outputs[k]['pred_classes']))
print(f'Images were detected for {len(red_dict)} objects')
print(f'An average of {np.round(np.array(l).mean(),2)} ({np.round(np.array(l).std(),2)}) objects was detected per image')

300
Images were detected for 300 objects
An average of 3.51 (3.38) objects was detected per image


In [32]:
def clean_text(sentence):
	tokenizer = RegexpTokenizer(r'\w+')
	lemmatizer = WordNetLemmatizer()
	pos_tags = nltk.pos_tag(tokenizer.tokenize(sentence))
	ret_s = [lemmatizer.lemmatize(el[0].lower()) for el in pos_tags if el[1] in ['NN', 'NNP', 'NNS']]
		
	return ret_s

def compute_overlap(obj_set, ar_list):
    
    if len(ar_list) == 0:
        return 0
    intersection = 0
    for el in ar_list:
        if el in obj_set:
            intersection += 1
    
    return intersection/len(ar_list)

right_ar = []
right_obj = []
wrong_ar = []
wrong_obj = []
for i in range(len(df)):
    im = df.image_path[i]
    right_ar.append(clean_text(df.ar[i]))
    right_obj.append(get_classes_list(classes, red_dict[im]))
    wrong_ar.append(clean_text(df.distractor_1[i]))
    wrong_ar.append(clean_text(df.distractor_2[i]))
    wrong_obj.append(get_classes_list(classes, red_dict[im]))
    wrong_obj.append(get_classes_list(classes, red_dict[im]))


overl_right = np.array([compute_overlap(set(right_obj[i]), right_ar[i] ) for i in range(len(right_ar))])
overl_wrong = np.array([compute_overlap(set( wrong_obj[i]), wrong_ar[i]) for i in range(len(wrong_ar))])
print(overl_right.mean(), overl_right.shape)
print(overl_wrong.mean(), overl_wrong.shape)
ttest_ind(overl_right, overl_wrong)


0.022809523809523807 (300,)
0.03536574074074073 (600,)


TtestResult(statistic=-1.4576480091118431, pvalue=0.14528719856443884, df=898.0)

In [33]:
obj_mention_trade = overl_right

In [51]:
print(spearmanr(np.concatenate((overl_wrong, overl_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

SignificanceResult(statistic=0.04066508794897715, pvalue=0.22293479751555514)


Captions

In [34]:
right_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_corr_ar_our_distractors.pkl'), "rb"))
wrong_outs = pickle.load(open(join(OUTPUT_PATH,'mpnet', 'all-mpnet-base-v2_wrong_ar_our_distractors.pkl'), "rb"))
captions = pickle.load(open(join(OUTPUT_PATH, 'blip2', 'blip2-opt-2.7b_blip_captions.pkl'), "rb"))
capt_outs = pickle.load(open(join(OUTPUT_PATH, 'mpnet', 'all-mpnet-base-v2_caption_embeddings_ours.pkl'), "rb"))
df = pd.read_csv(join(OUTPUT_PATH, 'dist_w_ocr.csv'))

In [35]:
def compute_sim(dataframe, captions, capt_outputs, wrong_outputs):
    ret = np.array([])
    for i in range(len(dataframe)):
        new_arr = np.zeros(2)
    
        new_arr[0] = pearsonr(wrong_outputs(dataframe.distractor_1[i]), capt_outputs(captions['outputs'][dataframe.image_path[i]]['caption']))[0]
        new_arr[1] = pearsonr(wrong_outputs(dataframe.distractor_2[i]), capt_outputs(captions['outputs'][dataframe.image_path[i]]['caption']))[0]
        ret = np.concatenate((ret, new_arr))

    return ret


scores_right = df.apply(lambda x: pearsonr(capt_outs(captions['outputs'][x.image_path]['caption']), right_outs(x.ar))[0], axis=1)
scores_wrong = compute_sim(df,captions, capt_outs, wrong_outs)
print(scores_right.mean(), scores_right.shape)
print(scores_wrong.mean(), scores_right.shape)
ttest_ind(scores_right, scores_wrong)

0.34228443398276237 (300,)
0.346720127807326 (300,)


TtestResult(statistic=-0.36493185801230943, pvalue=0.7152481990220099, df=898.0)

In [36]:
cap_sim_trade = scores_right

In [54]:
print(spearmanr(np.concatenate((scores_wrong, scores_right), axis=0), 
                                           np.concatenate((clip_scores_wrong, clip_scores_right)), 
                                           alternative='two-sided'))

SignificanceResult(statistic=0.29854899409340835, pvalue=5.507669434536161e-20)


In [37]:
print(text_overlap_orig.shape)
print(text_overl_trade.shape)
print(text_sim_orig.shape)
print(text_sim_trade.shape)
print(obj_mention_orig.shape)
print(obj_mention_trade.shape)
print(cap_sim_orig.shape)
print(cap_sim_trade.shape)

(38596,)
(294,)
(38596,)
(294,)
(35617,)
(300,)
(40171,)
(300,)


In [38]:
print(text_overlap_orig.mean())
print(text_overl_trade.mean())
print(text_sim_orig.mean())
print(text_sim_trade.mean())
print(obj_mention_orig.mean())
print(obj_mention_trade.mean())
print(cap_sim_orig.mean())
print(cap_sim_trade.mean())

0.21112030031086035
0.2706212686744373
0.4056594954126213
0.44220495456887843
0.027336710966017193
0.022809523809523807
0.3159710148179032
0.34228443398276237


In [39]:
pickle.dump({'text_overlap_orig': text_overlap_orig,
         'text_overlap_trade': text_overl_trade,
         'text_sim_orig': text_sim_orig,
         'text_sim_trade': text_sim_trade,
         'obj_mention_origin': obj_mention_orig,
         'obj_mention_trade': obj_mention_trade,
         'cap_sim_origin': cap_sim_orig,
         'cap_sim_trade': cap_sim_trade}, open('grounding_scores.pkl','wb'))