## 01. Data preprocessing

In [1]:
import json

### Prepare test data: follow the Karpathy test split (keeping its image order)

In [2]:
# Load the Karpathy split file and keep only the images tagged "test",
# in the order they appear in the file.
with open("../data/raw/coco/dataset_coco_karpathy.json") as split_file:
    karpathy = json.load(split_file)

karpathy_test = [image for image in karpathy["images"] if image["split"] == "test"]

# Write the COCO id of every test image, one id per line.
with open("../data/processed/finecap/test_keys.txt", "w") as keys_file:
    keys_file.writelines(str(image["cocoid"]) + "\n" for image in karpathy_test)

In [3]:
# Read the test keys back as whitespace-stripped strings, one per line.
with open("../data/processed/finecap/test_keys.txt") as keys_file:
    test_key = [line.strip() for line in keys_file]

In [4]:
# Load the evaluation result file and keep only its "imgToEval" entry.
with open("../data/processed/finecap/clip_rn50_transformer_scl_pl_scst_cider_test.json") as eval_file:
    data = json.load(eval_file)["imgToEval"]

In [5]:
# Write the candidate caption of every test key, one caption per line,
# following the order of test_key.
with open("../data/processed/finecap/cand_cider.txt", "w") as cand_file:
    cand_file.writelines(data[image_id]["caption"] + "\n" for image_id in test_key)

In [7]:
# ground truth caption

# Reload the Karpathy split and keep the "test" images in file order.
with open("../data/raw/coco/dataset_coco_karpathy.json") as f:
    karpathy = json.load(f)
karpathy_test = [x for x in karpathy["images"] if x["split"] == "test"]

# Check that test_key and the split agree *before* opening the output file.
# Previously a mismatch only printed "error" and stopped the loop, leaving a
# silently truncated gt.txt on disk; zip() would also hide a length mismatch.
if len(test_key) != len(karpathy_test):
    raise ValueError(
        "test_key has %d entries but the Karpathy test split has %d images"
        % (len(test_key), len(karpathy_test))
    )
for position, (key, item) in enumerate(zip(test_key, karpathy_test)):
    if key != str(item["cocoid"]):
        raise ValueError(
            "test_key[%d] = %r does not match cocoid %r"
            % (position, key, item["cocoid"])
        )

with open("../data/processed/finecap/gt.txt", "w") as f:
    for item in karpathy_test:
        # Keep at most the first five reference captions of each image,
        # one stripped caption per line.
        for gt in item["sentences"][:5]:
            f.write(gt["raw"].strip() + "\n")

In [8]:
# image path

# Check that test_key and karpathy_test agree *before* opening the output
# file. Previously a mismatch only printed "error" and stopped the loop,
# leaving a silently truncated imgs.txt; zip() would also hide a length
# mismatch.
if len(test_key) != len(karpathy_test):
    raise ValueError(
        "test_key has %d entries but the Karpathy test split has %d images"
        % (len(test_key), len(karpathy_test))
    )
for position, (key, item) in enumerate(zip(test_key, karpathy_test)):
    if key != str(item["cocoid"]):
        raise ValueError(
            "test_key[%d] = %r does not match cocoid %r"
            % (position, key, item["cocoid"])
        )

# One image path per line, in the same order as test_key.
with open("../data/processed/finecap/imgs.txt", "w") as f:
    for item in karpathy_test:
        f.write("../data/raw/coco/" + item["filepath"] + "/" + item["filename"] + "\n")

## 02-A. Compute score: feature extraction

In [12]:
# Paths and settings for the feature-extraction step.
IMAGE_PATH = "../data/processed/finecap/imgs.txt"         # input: text file read line by line as image paths
IMAGE_FEATURE = "../data/processed/finecap/image_features_02.pkl"  # output: pickle file the extracted features are dumped to

# Batch size passed to FeatureExtractor.
FEATURE_EXTRACTOR_BATCH_SIZE = 4
# Model file and config file passed to FeatureExtractor.
DETECTRON_MODEL  = "../data/detection/detectron_model.pth"
DETECTRON_CONFIG = "../data/detection/detectron_config.yaml"

In [13]:
import sys
sys.path.append("../vilbert")
from extract_features_custom import FeatureExtractor

# Build the feature extractor from the model file, config file and batch size
# configured above.
feature_extractor = FeatureExtractor(
    model_file=DETECTRON_MODEL,
    config_file=DETECTRON_CONFIG,
    batch_size=FEATURE_EXTRACTOR_BATCH_SIZE,
)


Summary
----------------------------------------------------------------------
 - model_file                         : ../data/detection/detectron_model.pth
 - config_file                        : ../data/detection/detectron_config.yaml
 - batch_size                         : 4
 - num_features                       : 100
 - output_folder                      : 
 - image_dir                          : 
 - feature_name                       : fc6
 - confidence_threshold               : 0
 - background                         : False
 - partition                          : 0


In [14]:
# Read the stripped image paths and keep only those from position 2500 onward.
# NOTE(review): presumably the second partition of the test images, matching
# the "_02" suffix of IMAGE_FEATURE - confirm.
with open(IMAGE_PATH) as paths_file:
    image_path = [line.strip() for line in paths_file][2500:]

len(image_path)

2500

In [None]:
# Extract features for every path in image_path.
img_features = feature_extractor.extract_features_direct(image_path)

# Sanity check: one result per input path.
assert len(img_features) == len(image_path)

304it [22:31,  4.38s/it]

In [None]:
import pickle
# Serialize the extracted features to the configured output file.
with open(IMAGE_FEATURE, "wb") as feature_file:
    pickle.dump(img_features, feature_file)

## 02-B compute score

In [2]:
# Inputs for the scoring step.
# NOTE(review): these paths point at capeval1k/, whereas section 02-A of this
# notebook writes finecap/image_features_02.pkl - confirm which dataset is
# intended before re-running.
IMAGE_FEATURE = "../data/processed/capeval1k/image_features.pkl"  # pickled image features (loaded with pickle.load)
GENERATED_CAPTION = "../data/processed/capeval1k/cand.txt"  # candidate captions, read one per line
GT_CAPTION = "../data/processed/capeval1k/gt.txt"  # reference captions, read one per line

# Batch size passed to VilbertScore.
VS_BATCH_SIZE = 400

In [3]:
import pandas as pd
import sys
sys.path.append("../vilbert")
import pickle
from compute_vilbertscore_custom import VilbertScore
# Build the scorer once; the same instance is reused for every scoring pass.
vs = VilbertScore(batch_size=VS_BATCH_SIZE)






In [4]:
def _read_stripped_lines(path):
    """Return the lines of the text file at `path`, each stripped of surrounding whitespace."""
    with open(path, "r") as text_file:
        return [line.strip() for line in text_file]


# NOTE: pickle.load can execute arbitrary code while loading; only open
# feature files that you produced yourself.
with open(IMAGE_FEATURE, "rb") as f:
    imgs = pickle.load(f)

cand_caps = _read_stripped_lines(GENERATED_CAPTION)
gt_caps = _read_stripped_lines(GT_CAPTION)

print(len(imgs), len(cand_caps), len(gt_caps))

# The scoring loop selects the references of pass i as the captions whose
# position satisfies `index % num_itr == i`, which only lines up with
# cand_caps when every candidate has the same number of references.
# Use exact integer arithmetic and fail loudly otherwise: int(a / b) would
# silently truncate a non-integer ratio and misalign every later pair.
if not cand_caps:
    raise ValueError("no candidate captions found in " + GENERATED_CAPTION)
num_itr, leftover = divmod(len(gt_caps), len(cand_caps))
if leftover:
    raise ValueError(
        "%d reference captions cannot be split evenly over %d candidates"
        % (len(gt_caps), len(cand_caps))
    )
print("num_itr: ", num_itr)

1000 1000 5000
num_itr:  5


In [5]:
# Multi-reference case: score the candidates against one reference per
# candidate at a time, then stack the per-pass results.
pass_frames = []
for i in range(num_itr):

    # References for this pass: every num_itr-th caption, starting at offset i.
    sub_gt_cap = gt_caps[i::num_itr]

    # Hand the data for this pass to the scorer.
    vs.loaddata(
        list_image_feature=imgs,
        list_gen_caption=cand_caps,
        list_gt_caption=sub_gt_cap,
        max_len=50,
    )

    precision, recall, f1 = vs.compute()

    # One row per candidate, columns 0/1/2 = precision/recall/f1.
    pass_frames.append(pd.DataFrame(data=[precision, recall, f1]).T)

# Concatenate without resetting the index, so each row label repeats once per
# pass; the columns are named afterwards.
df = pd.concat(pass_frames).rename(columns={0: "precision", 1: "recall", 2: "f1"})

  0%|          | 0/3 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a woman sitting at a table with a cake', 'a bunch of knives on a cutting board with a knife']
Ground truth Captions (2 samples):  ['a young girl inhales with the intent of blowing out a candle.', 'a wooden ball on top of a wooden stick.']


100%|██████████| 3/3 [00:10<00:00,  3.66s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a woman sitting at a table with a cake', 'a bunch of knives on a cutting board with a knife']
Ground truth Captions (2 samples):  ['a young girl is preparing to blow out her candle.', 'the table is full of wooden spoons and utensils.']


100%|██████████| 3/3 [00:10<00:00,  3.59s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a woman sitting at a table with a cake', 'a bunch of knives on a cutting board with a knife']
Ground truth Captions (2 samples):  ['a kid is to blow out the single candle in a bowl of birthday goodness.', 'a wood table holding an assortment of wood cooking utensils.']


100%|██████████| 3/3 [00:10<00:00,  3.61s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a woman sitting at a table with a cake', 'a bunch of knives on a cutting board with a knife']
Ground truth Captions (2 samples):  ['girl blowing out the candle on an ice-cream', 'a selection of wooden kitchen tools on a counter.']


100%|██████████| 3/3 [00:10<00:00,  3.62s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a woman sitting at a table with a cake', 'a bunch of knives on a cutting board with a knife']
Ground truth Captions (2 samples):  ['a little girl is getting ready to blow out a candle on a small dessert.', 'wooden spoons are lined up on a table']


100%|██████████| 3/3 [00:10<00:00,  3.63s/it]


In [6]:
# Display the per-pass scores. Row labels repeat once per pass because the
# per-pass frames were concatenated without resetting the index.
df

Unnamed: 0,precision,recall,f1
0,0.869753,0.786476,0.826021
1,0.777076,0.802007,0.789345
2,0.901035,0.862815,0.881511
3,0.853539,0.815534,0.834103
4,0.889996,0.877415,0.883660
...,...,...,...
995,0.904070,0.903172,0.903620
996,0.779771,0.754415,0.766883
997,0.911715,0.884409,0.897855
998,0.884133,0.848135,0.865760


### Analysis: average the scores over the reference captions

In [16]:
import numpy as np

In [18]:
def _mean_score_per_candidate(frame, column, n_candidates, n_references):
    """Average one score column over the reference passes of each candidate.

    `frame` is expected to hold one row per (candidate, pass), labelled with
    the candidate's position, so `frame[column][k]` selects every pass score
    of candidate k.

    Returns a list with one averaged score per candidate position.
    """
    column_values = frame[column]  # look the column up once, not per candidate
    return [
        np.sum(column_values[position]) / n_references
        for position in range(n_candidates)
    ]


# Divide by num_itr (the number of reference passes that were scored) instead
# of a hard-coded 5, so the mean stays correct for other reference counts.
scores_precision = _mean_score_per_candidate(df, "precision", len(cand_caps), num_itr)
scores_recall = _mean_score_per_candidate(df, "recall", len(cand_caps), num_itr)
scores_f1 = _mean_score_per_candidate(df, "f1", len(cand_caps), num_itr)

In [19]:
# Assemble the averaged scores into one frame: one row per candidate,
# columns named precision / recall / f1.
df_rst = (
    pd.DataFrame([scores_precision, scores_recall, scores_f1])
    .transpose()
    .rename(columns={0: "precision", 1: "recall", 2: "f1"})
)

In [20]:
# Save the averaged scores as CSV, without writing the row index.
df_rst.to_csv("../data/processed/capeval1k/vilbertscore.csv", index=False)