<a href="https://colab.research.google.com/github/vgaquino/capstone-team-29/blob/main/model_evaluation_kosmos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import argparse
import json
from datetime import datetime
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import zipfile
import itertools

from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq, Blip2Processor, Blip2ForConditionalGeneration
import torch

from google.colab import drive
# Mount Google Drive; the annotation archive, images and outputs are read from it.
drive.mount('/content/drive')

# Fetch the NLTK resources used for tokenization and stop-word filtering.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
stop_words = {word for word in stopwords.words('english')}

# Show full caption text instead of truncating wide columns.
pd.set_option('display.max_colwidth', None)

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


1. Tokenize ground truth captions

In [None]:
# Archives holding the annotation files (val.json, train.json, test.json).
zip_files = ['/content/drive/MyDrive/Colab Notebooks/annotations.zip']

# Directory that receives the extracted files; created when it does not exist.
destination_folder = '/content/'
os.makedirs(destination_folder, exist_ok=True)

# Unpack every archive into the destination directory.
for archive_path in zip_files:
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=destination_folder)

In [None]:
# Load the validation annotations and list the top-level sections they contain.
with open('annotations/val.json', mode='r') as annotation_file:
    annotation_data = json.loads(annotation_file.read())

annotation_data.keys()

dict_keys(['info', 'images', 'annotations'])

The validation set contains 7,750 images, which we can use to evaluate all three models.

In [None]:
# Collect each reference caption together with the id of the image it describes.
annotation_records = annotation_data['annotations']
caption_list = [record['caption'] for record in annotation_records]
id_list = [record['image_id'] for record in annotation_records]

In [None]:
# One row per reference caption, keyed by the image it belongs to.
df_reference = pd.DataFrame({'caption': caption_list, 'image_id': id_list})

In [None]:
# Lower-case every reference caption and split it into word tokens.
lowercased_captions = df_reference['caption'].str.lower()
df_reference['tokens'] = lowercased_captions.apply(nltk.word_tokenize)

In [None]:
# Peek at the first five tokenized reference captions.
df_reference['tokens'].head(5)

0               [a, computer, screen, shows, a, repair, prompt, on, the, screen, .]
1                    [a, computer, screen, with, a, repair, automatically, pop, up]
2                      [partial, computer, screen, showing, the, need, of, repairs]
3        [part, of, a, computer, monitor, showing, a, computer, repair, message, .]
4    [the, top, of, a, laptop, with, a, blue, background, and, dark, blue, text, .]
Name: tokens, dtype: object

In [None]:
# Build the bi-grams of every tokenized reference caption.
# The tokenized captions live in `df_reference` (no `df_sample` frame is defined
# in this notebook), so the bi-grams are derived from and stored on that frame.
df_reference['bi-grams'] = df_reference['tokens'].apply(
    lambda tokens: list(nltk.ngrams(tokens, n=2))
)

In [None]:
# Re-base the annotation image ids so that they match the zero-based numbers in
# the image file names (VizWiz_val_00000000.jpg, ...).
VAL_IMAGE_ID_OFFSET = 23431  # id carried by the first validation image in val.json

# Guarded so that re-running the cell does not subtract the offset a second time:
# once re-based, every id is far below the offset.
if df_reference['image_id'].min() >= VAL_IMAGE_ID_OFFSET:
    df_reference['image_id'] = df_reference['image_id'] - VAL_IMAGE_ID_OFFSET

In [None]:
# Human baseline: for every image, score each reference caption against the
# remaining references of the same image (leave-one-out) with BLEU over uni- and
# bi-grams, then average the scores per image.
baseline_scores = {}
for image_id, image_tokens in df_reference.groupby('image_id')['tokens']:
  captions = image_tokens.tolist()
  if len(captions) < 2:
    # A single caption has no other reference to be compared against.
    continue
  image_scores = [
      nltk.translate.bleu_score.sentence_bleu(
          captions[:i] + captions[i + 1:], captions[i], weights=(0.5, 0.5))
      for i in range(len(captions))
  ]
  # Plain int keys keep the dict serializable with json.dump.
  baseline_scores[int(image_id)] = round(sum(image_scores)/len(image_scores), 4)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Preview the first ten entries; the dict holds one score per validation image,
# so displaying it whole floods the notebook output.
dict(itertools.islice(baseline_scores.items(), 10))

{0: 0.3783,
 1: 0.3805,
 2: 0.4662,
 3: 0.4165,
 4: 0.246,
 5: 0.5528,
 6: 1.0,
 7: 0.6,
 8: 0.7181,
 9: 0.2553,
 10: 0.3405,
 11: 0.2565,
 12: 0.0832,
 13: 0.8,
 14: 0.8,
 15: 0.1769,
 16: 0.2261,
 17: 0.3101,
 18: 0.4457,
 19: 0.2885,
 20: 0.1049,
 21: 0.0618,
 22: 0.1427,
 23: 0.441,
 24: 0.412,
 25: 0.1485,
 26: 0.3926,
 27: 1.0,
 28: 0.4124,
 29: 0.0,
 30: 0.2099,
 31: 0.3637,
 32: 0.3901,
 33: 0.5833,
 34: 0.2986,
 35: 0.4907,
 36: 0.3368,
 37: 0.0,
 38: 0.8,
 39: 0.1567,
 40: 0.3265,
 41: 0.4317,
 42: 0.563,
 43: 0.4,
 44: 0.2257,
 45: 0.4679,
 46: 0.3485,
 47: 0.2818,
 48: 0.4611,
 49: 0.1759,
 50: 0.0916,
 51: 0.2604,
 52: 0.0,
 53: 0.3403,
 54: 0.383,
 55: 0.1794,
 56: 0.1116,
 57: 0.3196,
 58: 0.3821,
 59: 0.2106,
 60: 0.4786,
 61: 0.1998,
 62: 0.4792,
 63: 0.5633,
 64: 0.7042,
 65: 0.0856,
 66: 0.1902,
 67: 0.4072,
 68: 0.4234,
 69: 0.2919,
 70: 0.2124,
 71: 0.2827,
 72: 0.3248,
 73: 0.3113,
 74: 0.075,
 75: 0.2022,
 76: 0.5763,
 77: 0.4106,
 78: 0.0781,
 79: 0.2207,
 80: 0

In [None]:
# Persist the per-image baseline scores to Drive so they can be reloaded later.
baseline_scores_path = '/content/drive/MyDrive/Colab Notebooks/BLEU_val_baseline_scores.json'
with open(baseline_scores_path, 'w') as baseline_file:
    baseline_file.write(json.dumps(baseline_scores, indent=4))

In [None]:
# Load the Kosmos-2 checkpoint together with its matching processor.
kosmos_checkpoint = "microsoft/kosmos-2-patch14-224"
model = AutoModelForVision2Seq.from_pretrained(kosmos_checkpoint)
processor = AutoProcessor.from_pretrained(kosmos_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.66G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/191k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/32.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def kosmos2_generate_caption(image_id, model, processor):
  """Caption one validation image with Kosmos-2 and append the result to the outputs CSV.

  Args:
    image_id: Zero-based number of the validation image; it is zero-padded to
      eight digits to build the `VizWiz_val_XXXXXXXX.jpg` file name.
    model: Model exposing `generate`, called with the processor's outputs.
    processor: Processor used to build the model inputs and to decode and
      post-process the generated ids.

  Returns:
    None. Rows with `image_id`, `caption` and `entity_list` are appended to
    `kosmos_val_outputs.csv` on Drive: one row per grounded entity, or a single
    row with an empty entity when the model grounded nothing.
  """
  prompt_text = "An image of"
  prompt = "<grounding>" + prompt_text
  image_path = '/content/drive/MyDrive/Colab Notebooks/val/VizWiz_val_{}.jpg'.format(str(image_id).zfill(8))

  # Release the image file handle as soon as the processor has read the pixels.
  with Image.open(image_path) as image:
    inputs = processor(text=prompt, images=image, return_tensors="pt")

  generated_ids = model.generate(
      pixel_values=inputs["pixel_values"],
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      image_embeds=None,
      image_embeds_position_mask=inputs["image_embeds_position_mask"],
      use_cache=True,
      max_new_tokens=128,
  )

  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
  processed_text, entities = processor.post_process_generation(generated_text)

  # Drop the echoed prompt only when it is really there, rather than cutting a
  # fixed number of characters off whatever text came back.
  if processed_text.startswith(prompt_text):
    caption = processed_text[len(prompt_text):].lstrip()
  else:
    caption = processed_text
  entity_list = [entity[0] for entity in entities]

  # pandas broadcasts the scalar columns over `entity_list`, giving one row per
  # entity. An empty list would produce zero rows and silently lose the caption,
  # so fall back to a single placeholder entity.
  new_rows = pd.DataFrame({
      'image_id': image_id,
      'caption': caption,
      'entity_list': entity_list or [None],
  })

  output_path = '/content/drive/My Drive/Colab Notebooks/kosmos_val_outputs.csv'
  # The whole CSV is re-read and re-written on every call so that progress
  # survives an interrupted session; start a fresh file when none exists yet.
  if os.path.exists(output_path):
    outputs = pd.concat([pd.read_csv(output_path), new_rows], ignore_index=True)
  else:
    outputs = new_rows

  outputs.to_csv(output_path, index=False)

  print('VizWiz_val_{}.jpg processed'.format(image_id))

In [None]:
# Reset helper for the Kosmos outputs CSV: keeps the header, drops every row.
# Resetting throws away all captions generated so far, so it only happens when
# the flag below is switched on deliberately.
RESET_KOSMOS_OUTPUTS = False

output_path = '/content/drive/My Drive/Colab Notebooks/kosmos_val_outputs.csv'
df_kosmos = pd.read_csv(output_path)
df_kosmos_cleaned = df_kosmos[0:0]  # same columns, zero rows
if RESET_KOSMOS_OUTPUTS:
    # Write the emptied frame itself; no other frame belongs in this file.
    df_kosmos_cleaned.to_csv(output_path, index=False)

In [None]:
# Caption every validation image that is not in the outputs CSV yet, so an
# interrupted run can be resumed without hand-editing a start index.
NUM_VAL_IMAGES = 7750
kosmos_outputs_path = '/content/drive/My Drive/Colab Notebooks/kosmos_val_outputs.csv'
if os.path.exists(kosmos_outputs_path):
  already_captioned = set(pd.read_csv(kosmos_outputs_path)['image_id'])
else:
  already_captioned = set()

for image_id in range(NUM_VAL_IMAGES):
  if image_id in already_captioned:
    continue
  kosmos2_generate_caption(image_id, model, processor)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
VizWiz_val_2671.jpg processed
VizWiz_val_2672.jpg processed
VizWiz_val_2673.jpg processed
VizWiz_val_2674.jpg processed
VizWiz_val_2675.jpg processed
VizWiz_val_2676.jpg processed
VizWiz_val_2677.jpg processed
VizWiz_val_2678.jpg processed
VizWiz_val_2679.jpg processed
VizWiz_val_2680.jpg processed
VizWiz_val_2681.jpg processed
VizWiz_val_2682.jpg processed
VizWiz_val_2683.jpg processed
VizWiz_val_2684.jpg processed
VizWiz_val_2685.jpg processed
VizWiz_val_2686.jpg processed
VizWiz_val_2687.jpg processed
VizWiz_val_2688.jpg processed
VizWiz_val_2689.jpg processed
VizWiz_val_2690.jpg processed
VizWiz_val_2691.jpg processed
VizWiz_val_2692.jpg processed
VizWiz_val_2693.jpg processed
VizWiz_val_2694.jpg processed
VizWiz_val_2695.jpg processed
VizWiz_val_2696.jpg processed
VizWiz_val_2697.jpg processed
VizWiz_val_2698.jpg processed
VizWiz_val_2699.jpg processed
VizWiz_val_2700.jpg processed
VizWiz_val_2701.jpg processed
VizWi

In [None]:
# Tokenize a sample caption that contains an unmatched double quote.
sentense = 'a message that says "You must repair the computer automatically'
sentense_tokens = nltk.word_tokenize(sentense.lower())
sentense_tokens

['a',
 'message',
 'that',
 'says',
 '``',
 'you',
 'must',
 'repair',
 'the',
 'computer',
 'automatically']

In [None]:
# Same sample caption, reduced to its content words.
sentense = 'a message that says "You must repair the computer automatically'
sentense_tokens = nltk.word_tokenize(sentense.lower())

# First drop English stop words, then anything that is not purely alphabetic.
sentence_no_stop_words = [token for token in sentense_tokens if token not in stop_words]
filtered_sentence = list(filter(str.isalpha, sentence_no_stop_words))

filtered_sentence

['message', 'says', 'must', 'repair', 'computer', 'automatically']

In [None]:
# Reload the Kosmos outputs CSV from Drive.
output_path = '/content/drive/My Drive/Colab Notebooks/kosmos_val_outputs.csv'
df = pd.read_csv(filepath_or_buffer=output_path)

In [None]:
# Collapse the outputs to unique (image, caption) pairs and report how many there are.
caption_columns = ['image_id', 'caption']
df_captions = df.loc[:, caption_columns].drop_duplicates()
print(df_captions.shape[0])

7750


In [None]:
# First five reference captions with their tokens.
df_reference.head(5)

Unnamed: 0,caption,image_id,tokens
0,A computer screen shows a repair prompt on the screen.,23431,"[a, computer, screen, shows, a, repair, prompt, on, the, screen, .]"
1,a computer screen with a repair automatically pop up,23431,"[a, computer, screen, with, a, repair, automatically, pop, up]"
2,partial computer screen showing the need of repairs,23431,"[partial, computer, screen, showing, the, need, of, repairs]"
3,Part of a computer monitor showing a computer repair message.,23431,"[part, of, a, computer, monitor, showing, a, computer, repair, message, .]"
4,The top of a laptop with a blue background and dark blue text.,23431,"[the, top, of, a, laptop, with, a, blue, background, and, dark, blue, text, .]"


Compare captions using BLEU score

In [None]:
# Score each Kosmos caption against all reference captions of the same image
# with BLEU over uni- and bi-grams (equal weights).

# Group the reference token lists per image once, instead of filtering the
# whole frame again for every image.
references_by_image = {}
for ref_image_id, ref_tokens in zip(df_reference['image_id'], df_reference['tokens']):
  references_by_image.setdefault(ref_image_id, []).append(ref_tokens)

# Keep the first caption found for each image.
caption_by_image = {}
for hyp_image_id, hyp_caption in zip(df_captions['image_id'], df_captions['caption']):
  caption_by_image.setdefault(hyp_image_id, hyp_caption)

kosmos_score = {}
for image_id in sorted(references_by_image):
  if image_id not in caption_by_image:
    raise KeyError('no Kosmos caption found for image_id {}'.format(image_id))
  caption = caption_by_image[image_id]
  # An empty caption is read back from the CSV as NaN; score it as an empty hypothesis.
  hypothesis = nltk.tokenize.word_tokenize(caption.lower()) if isinstance(caption, str) else []
  bleu_score = nltk.translate.bleu_score.sentence_bleu(
      references_by_image[image_id], hypothesis, weights=(0.5, 0.5))
  kosmos_score[image_id] = round(bleu_score, 4)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Preview the first ten entries; the dict holds one score per validation image,
# so displaying it whole floods the notebook output.
dict(itertools.islice(kosmos_score.items(), 10))

{0: 0.4022,
 1: 0.6124,
 2: 0.7746,
 3: 0.2026,
 4: 0.5003,
 5: 0.174,
 6: 0,
 7: 0.2582,
 8: 0.0,
 9: 0.345,
 10: 0.4837,
 11: 0.2476,
 12: 0.3204,
 13: 0.0,
 14: 0.2317,
 15: 0.2462,
 16: 0.1741,
 17: 0.2988,
 18: 0.4415,
 19: 0.201,
 20: 0.3403,
 21: 0.2752,
 22: 0.4833,
 23: 0.7071,
 24: 0.363,
 25: 0.2749,
 26: 0.3132,
 27: 0,
 28: 0.2583,
 29: 0.1543,
 30: 0.4226,
 31: 0.3091,
 32: 0.4082,
 33: 0.3015,
 34: 0.686,
 35: 0.2009,
 36: 0.5938,
 37: 0.0,
 38: 0.1741,
 39: 0.4022,
 40: 0.0,
 41: 0.4469,
 42: 0.5578,
 43: 0.3015,
 44: 0.2523,
 45: 0.2344,
 46: 0.2081,
 47: 0.3976,
 48: 0.5158,
 49: 0.4412,
 50: 0.0,
 51: 0.3989,
 52: 0.3727,
 53: 0.2988,
 54: 0.4474,
 55: 0.2357,
 56: 0.4082,
 57: 0.6136,
 58: 0.4924,
 59: 0.3132,
 60: 0.108,
 61: 0.0,
 62: 0.201,
 63: 0.4226,
 64: 0.2789,
 65: 0.4495,
 66: 0.4064,
 67: 0.4753,
 68: 0.6553,
 69: 0.4369,
 70: 0.1543,
 71: 0.4045,
 72: 0.7426,
 73: 0.423,
 74: 0.2593,
 75: 0.5164,
 76: 0.21,
 77: 0.3297,
 78: 0.4804,
 79: 0.4634,
 80: 0.3

In [None]:
# Reload the saved baseline scores. JSON object keys are always strings, so the
# image ids are converted back to int to match the keys of `kosmos_score`.
with open('/content/drive/MyDrive/Colab Notebooks/BLEU_val_baseline_scores.json') as jsonfile:
    baseline_scores = {int(image_id): score for image_id, score in json.load(jsonfile).items()}

In [None]:
# Preview the first ten reloaded entries rather than dumping the whole dict.
dict(itertools.islice(baseline_scores.items(), 10))

{'0': 0.3783,
 '1': 0.3805,
 '2': 0.4662,
 '3': 0.4165,
 '4': 0.246,
 '5': 0.5528,
 '6': 1.0,
 '7': 0.6,
 '8': 0.7181,
 '9': 0.2553,
 '10': 0.3405,
 '11': 0.2565,
 '12': 0.0832,
 '13': 0.8,
 '14': 0.8,
 '15': 0.1769,
 '16': 0.2261,
 '17': 0.3101,
 '18': 0.4457,
 '19': 0.2885,
 '20': 0.1049,
 '21': 0.0618,
 '22': 0.1427,
 '23': 0.441,
 '24': 0.412,
 '25': 0.1485,
 '26': 0.3926,
 '27': 1.0,
 '28': 0.4124,
 '29': 0.0,
 '30': 0.2099,
 '31': 0.3637,
 '32': 0.3901,
 '33': 0.5833,
 '34': 0.2986,
 '35': 0.4907,
 '36': 0.3368,
 '37': 0.0,
 '38': 0.8,
 '39': 0.1567,
 '40': 0.3265,
 '41': 0.4317,
 '42': 0.563,
 '43': 0.4,
 '44': 0.2257,
 '45': 0.4679,
 '46': 0.3485,
 '47': 0.2818,
 '48': 0.4611,
 '49': 0.1759,
 '50': 0.0916,
 '51': 0.2604,
 '52': 0.0,
 '53': 0.3403,
 '54': 0.383,
 '55': 0.1794,
 '56': 0.1116,
 '57': 0.3196,
 '58': 0.3821,
 '59': 0.2106,
 '60': 0.4786,
 '61': 0.1998,
 '62': 0.4792,
 '63': 0.5633,
 '64': 0.7042,
 '65': 0.0856,
 '66': 0.1902,
 '67': 0.4072,
 '68': 0.4234,
 '69': 0.2

In [None]:
def _mean_score(scores):
  """Arithmetic mean of the values of a per-image score dict."""
  return sum(scores.values())/len(scores)

# Average BLEU over all images, for the model and for the human baseline.
kosmos_score_mean = _mean_score(kosmos_score)
baseline_scores_mean = _mean_score(baseline_scores)

print('kosmos scores mean,', kosmos_score_mean)
print('baseline scores mean', baseline_scores_mean)

kosmos scores mean, 0.3240364645161291
baseline scores mean 0.3938580000000031


Compare captions by content-word (token) overlap

In [None]:
# Display the reference captions frame (caption, image_id, tokens).
df_reference

Unnamed: 0,caption,image_id,tokens
0,A computer screen shows a repair prompt on the screen.,0,"[a, computer, screen, shows, a, repair, prompt, on, the, screen, .]"
1,a computer screen with a repair automatically pop up,0,"[a, computer, screen, with, a, repair, automatically, pop, up]"
2,partial computer screen showing the need of repairs,0,"[partial, computer, screen, showing, the, need, of, repairs]"
3,Part of a computer monitor showing a computer repair message.,0,"[part, of, a, computer, monitor, showing, a, computer, repair, message, .]"
4,The top of a laptop with a blue background and dark blue text.,0,"[the, top, of, a, laptop, with, a, blue, background, and, dark, blue, text, .]"
...,...,...,...
38745,A closeup of fiat with a value of 10 on a kitchen counter.,7749,"[a, closeup, of, fiat, with, a, value, of, 10, on, a, kitchen, counter, .]"
38746,a single piece of Australian currency sitting on a table.,7749,"[a, single, piece, of, australian, currency, sitting, on, a, table, .]"
38747,An Australian monetary bill with the value of 10 displayed.,7749,"[an, australian, monetary, bill, with, the, value, of, 10, displayed, .]"
38748,An Australian ten dollar bill sitting on a brown surface.,7749,"[an, australian, ten, dollar, bill, sitting, on, a, brown, surface, .]"


In [None]:
# Exactly one Kosmos caption per image. The outputs CSV can hold several rows
# per image, and de-duplicating on `image_id` alone keeps the later left join
# from multiplying reference rows if an image ever ended up with two captions.
df_hypothesis = (
    df[['image_id', 'caption']]
    .drop_duplicates(subset='image_id')
    .rename(columns={'caption': 'hypothesis'})
)
df_hypothesis.head()

   image_id  \
0         0   
2         1   
4         2   
7         3   
8         4   

                                                                                 hypothesis  
0  a computer screen with a message that says "You must repair the computer automatically."  
2                                                          a bottle of Night Tree in a hand  
4                            a book cover with a dog and a cow walking down a snowy street.  
7                                                                      two boxes of condoms  
8                                           a computer screen with a ZoomText error message  


In [None]:
# Attach the model caption of each image to every one of its reference captions.
hypothesis_indexed = df_hypothesis.set_index('image_id')
df_joint = df_reference.join(hypothesis_indexed, on='image_id', how='left')
df_joint.head(5)

Unnamed: 0,caption,image_id,tokens,hypothesis
0,A computer screen shows a repair prompt on the screen.,0,"[a, computer, screen, shows, a, repair, prompt, on, the, screen, .]","a computer screen with a message that says ""You must repair the computer automatically."""
1,a computer screen with a repair automatically pop up,0,"[a, computer, screen, with, a, repair, automatically, pop, up]","a computer screen with a message that says ""You must repair the computer automatically."""
2,partial computer screen showing the need of repairs,0,"[partial, computer, screen, showing, the, need, of, repairs]","a computer screen with a message that says ""You must repair the computer automatically."""
3,Part of a computer monitor showing a computer repair message.,0,"[part, of, a, computer, monitor, showing, a, computer, repair, message, .]","a computer screen with a message that says ""You must repair the computer automatically."""
4,The top of a laptop with a blue background and dark blue text.,0,"[the, top, of, a, laptop, with, a, blue, background, and, dark, blue, text, .]","a computer screen with a message that says ""You must repair the computer automatically."""


In [None]:
# Name the columns by role: reference text/tokens versus model hypothesis.
column_renames = {'tokens': 'reference_tokens', 'caption': 'reference'}
df_joint = df_joint.rename(columns=column_renames)
df_joint.head(5)

Unnamed: 0,reference,image_id,reference_tokens,hypothesis
0,A computer screen shows a repair prompt on the screen.,0,"[a, computer, screen, shows, a, repair, prompt, on, the, screen, .]","a computer screen with a message that says ""You must repair the computer automatically."""
1,a computer screen with a repair automatically pop up,0,"[a, computer, screen, with, a, repair, automatically, pop, up]","a computer screen with a message that says ""You must repair the computer automatically."""
2,partial computer screen showing the need of repairs,0,"[partial, computer, screen, showing, the, need, of, repairs]","a computer screen with a message that says ""You must repair the computer automatically."""
3,Part of a computer monitor showing a computer repair message.,0,"[part, of, a, computer, monitor, showing, a, computer, repair, message, .]","a computer screen with a message that says ""You must repair the computer automatically."""
4,The top of a laptop with a blue background and dark blue text.,0,"[the, top, of, a, laptop, with, a, blue, background, and, dark, blue, text, .]","a computer screen with a message that says ""You must repair the computer automatically."""


In [None]:
def _content_words(tokens):
  """Keep purely alphabetic tokens that are not English stop words."""
  return [token for token in tokens if token not in stop_words and token.isalpha()]

# Tokenize the lower-cased model captions the same way as the references.
df_joint['hypothesis_tokens'] = df_joint['hypothesis'].str.lower().apply(nltk.word_tokenize)

# Reduce both token columns to content words.
for token_column in ('reference_tokens', 'hypothesis_tokens'):
  df_joint[token_column] = df_joint[token_column].apply(_content_words)

df_joint.head(20)

Unnamed: 0,reference,image_id,reference_tokens,hypothesis,hypothesis_tokens,matched tokens
0,A computer screen shows a repair prompt on the screen.,0,"[computer, screen, shows, repair, prompt, screen]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[., repair, screen, computer]"
1,a computer screen with a repair automatically pop up,0,"[computer, screen, repair, automatically, pop]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[computer, repair, screen, automatically]"
2,partial computer screen showing the need of repairs,0,"[partial, computer, screen, showing, need, repairs]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[screen, computer]"
3,Part of a computer monitor showing a computer repair message.,0,"[part, computer, monitor, showing, computer, repair, message]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[., message, repair, computer]"
4,The top of a laptop with a blue background and dark blue text.,0,"[top, laptop, blue, background, dark, blue, text]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]",[.]
5,A person is holding a bottle that has medicine for the night time.,1,"[person, holding, bottle, medicine, night, time]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
6,A bottle of medication has a white twist top.,1,"[bottle, medication, white, twist, top]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]",[bottle]
7,night time medication bottle being held by someone,1,"[night, time, medication, bottle, held, someone]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
8,a person holding a small black bottle of NIGHT TIME,1,"[person, holding, small, black, bottle, night, time]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
9,A bottle of what appears to be cough syrup held in hand.,1,"[bottle, appears, cough, syrup, held, hand]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, hand]"


In [None]:
# Content words shared by each reference caption and the model caption.
shared_tokens = []
for ref_tokens, hyp_tokens in zip(df_joint['reference_tokens'], df_joint['hypothesis_tokens']):
    shared_tokens.append(list(set(ref_tokens) & set(hyp_tokens)))
df_joint['matched tokens'] = shared_tokens
df_joint.head(20)

Unnamed: 0,reference,image_id,reference_tokens,hypothesis,hypothesis_tokens,matched tokens
0,A computer screen shows a repair prompt on the screen.,0,"[computer, screen, shows, repair, prompt, screen]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[repair, screen, computer]"
1,a computer screen with a repair automatically pop up,0,"[computer, screen, repair, automatically, pop]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[computer, repair, screen, automatically]"
2,partial computer screen showing the need of repairs,0,"[partial, computer, screen, showing, need, repairs]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[screen, computer]"
3,Part of a computer monitor showing a computer repair message.,0,"[part, computer, monitor, showing, computer, repair, message]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[message, repair, computer]"
4,The top of a laptop with a blue background and dark blue text.,0,"[top, laptop, blue, background, dark, blue, text]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]",[]
5,A person is holding a bottle that has medicine for the night time.,1,"[person, holding, bottle, medicine, night, time]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
6,A bottle of medication has a white twist top.,1,"[bottle, medication, white, twist, top]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]",[bottle]
7,night time medication bottle being held by someone,1,"[night, time, medication, bottle, held, someone]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
8,a person holding a small black bottle of NIGHT TIME,1,"[person, holding, small, black, bottle, night, time]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
9,A bottle of what appears to be cough syrup held in hand.,1,"[bottle, appears, cough, syrup, held, hand]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, hand]"


In [None]:
# Inspect the first fifty reference/hypothesis pairs.
df_joint.iloc[:50]

Unnamed: 0,reference,image_id,reference_tokens,hypothesis,hypothesis_tokens,matched tokens
0,A computer screen shows a repair prompt on the screen.,0,"[computer, screen, shows, repair, prompt, screen]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[repair, screen, computer]"
1,a computer screen with a repair automatically pop up,0,"[computer, screen, repair, automatically, pop]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[computer, repair, screen, automatically]"
2,partial computer screen showing the need of repairs,0,"[partial, computer, screen, showing, need, repairs]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[screen, computer]"
3,Part of a computer monitor showing a computer repair message.,0,"[part, computer, monitor, showing, computer, repair, message]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]","[message, repair, computer]"
4,The top of a laptop with a blue background and dark blue text.,0,"[top, laptop, blue, background, dark, blue, text]","a computer screen with a message that says ""You must repair the computer automatically.""","[computer, screen, message, says, must, repair, computer, automatically]",[]
5,A person is holding a bottle that has medicine for the night time.,1,"[person, holding, bottle, medicine, night, time]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
6,A bottle of medication has a white twist top.,1,"[bottle, medication, white, twist, top]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]",[bottle]
7,night time medication bottle being held by someone,1,"[night, time, medication, bottle, held, someone]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
8,a person holding a small black bottle of NIGHT TIME,1,"[person, holding, small, black, bottle, night, time]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, night]"
9,A bottle of what appears to be cough syrup held in hand.,1,"[bottle, appears, cough, syrup, held, hand]",a bottle of Night Tree in a hand,"[bottle, night, tree, hand]","[bottle, hand]"


In [None]:
# Concatenate the matched-token lists of all references of an image
# (summing lists joins them end to end).
rows_per_image = df_joint.groupby('image_id')
df_token_match = rows_per_image.agg({'matched tokens': ['sum']})
df_token_match.head(5)

Unnamed: 0_level_0,matched tokens
Unnamed: 0_level_1,sum
image_id,Unnamed: 1_level_2
0,"[repair, screen, computer, computer, repair, screen, automatically, screen, computer, message, repair, computer]"
1,"[bottle, night, bottle, bottle, night, bottle, night, bottle, hand]"
2,"[dog, book, cover, book, street, dog, snowy, walking, book, cover, book, street, dog, cover, walking]"
3,"[two, boxes, two, boxes, two, boxes]"
4,"[computer, message, error, message, error, screen, computer]"


In [None]:
# Flatten the two-level ('matched tokens', 'sum') header into the single name
# 'matched tokens sum' and expose image_id as a regular column.
# Both steps are guarded so that re-running the cell leaves the frame unchanged:
# joining an already flat string name would put spaces between its characters,
# and a second reset_index would add a stray 'index' column.
if isinstance(df_token_match.columns, pd.MultiIndex):
    df_token_match.columns = [' '.join(col).strip() for col in df_token_match.columns.values]
if 'image_id' not in df_token_match.columns:
    df_token_match = df_token_match.reset_index()
df_token_match.head()

Unnamed: 0,image_id,matched tokens sum
0,0,"[repair, screen, computer, computer, repair, screen, automatically, screen, computer, message, repair, computer]"
1,1,"[bottle, night, bottle, bottle, night, bottle, night, bottle, hand]"
2,2,"[dog, book, cover, book, street, dog, snowy, walking, book, cover, book, street, dog, cover, walking]"
3,3,"[two, boxes, two, boxes, two, boxes]"
4,4,"[computer, message, error, message, error, screen, computer]"


In [None]:
# Inspect the matched tokens of the first twenty images.
df_token_match.iloc[:20]

Unnamed: 0,image_id,matched tokens sum
0,0,"[repair, screen, computer, computer, repair, screen, automatically, screen, computer, message, repair, computer]"
1,1,"[bottle, night, bottle, bottle, night, bottle, night, bottle, hand]"
2,2,"[dog, book, cover, book, street, dog, snowy, walking, book, cover, book, street, dog, cover, walking]"
3,3,"[two, boxes, two, boxes, two, boxes]"
4,4,"[computer, message, error, message, error, screen, computer]"
5,5,"[screen, windows, screen, windows, screen, windows, screen, windows, screen]"
6,6,[]
7,7,[blurry]
8,8,[]
9,9,"[window, window, window]"


In [None]:
# Flag images whose model caption shares at least one content word with any reference.
match_counts = df_token_match['matched tokens sum'].map(len)
df_token_match['matched'] = (match_counts > 0).astype(int)
df_token_match.head(10)

Unnamed: 0,image_id,matched tokens sum,matched
0,0,"[repair, screen, computer, computer, repair, screen, automatically, screen, computer, message, repair, computer]",1
1,1,"[bottle, night, bottle, bottle, night, bottle, night, bottle, hand]",1
2,2,"[dog, book, cover, book, street, dog, snowy, walking, book, cover, book, street, dog, cover, walking]",1
3,3,"[two, boxes, two, boxes, two, boxes]",1
4,4,"[computer, message, error, message, error, screen, computer]",1
5,5,"[screen, windows, screen, windows, screen, windows, screen, windows, screen]",1
6,6,[],0
7,7,[blurry],1
8,8,[],0
9,9,"[window, window, window]",1


In [None]:
# Share of images with at least one matched content word.
matched_image_count = int(df_token_match['matched'].sum())
kosmos_tokens_match_score = matched_image_count/df_token_match.shape[0]
print('kosmos_tokens_match_score:', kosmos_tokens_match_score)

kosmos_tokens_match_score: 0.8901935483870967
