## 01. Data preprocessing

In [4]:
import json

### Prepare test data - follow the Karpathy test split (keeping its image order)

In [5]:
# Karpathy split: load the split file and keep the images marked as "test"
with open("../data/raw/coco/dataset_coco_karpathy.json") as split_file:
    karpathy = json.load(split_file)

karpathy_test = [image for image in karpathy["images"] if image["split"] == "test"]
len(karpathy_test)

# Write one COCO id per line, in the same order as the split file
with open("../data/processed/finecap/test_keys.txt", "w") as key_file:
    key_file.writelines(str(image["cocoid"]) + "\n" for image in karpathy_test)

In [6]:
# Load the test-set COCO ids (one per line, surrounding whitespace removed)
with open("../data/processed/finecap/test_keys.txt") as key_file:
    test_key = [line.strip() for line in key_file]

In [11]:
# Read the evaluation file of the model being scored and keep its per-image
# entries ("imgToEval"). Alternative inputs kept for reference:
# with open("../data/processed/finecap/clip_rn50_transformer_scl_pl_scst_cider_test.json") as f:
# with open("../data/processed/finecap/clip_rn50_transformer_scl_pl_scst_clipscore_test.json") as f:
with open("../data/processed/finecap/clip_rn50_transformer_scl_pl_scst_clipscore_grammar_normalized_test.json") as eval_file:
    data = json.load(eval_file)["imgToEval"]

In [12]:
# Candidate captions: one generated caption per line, in test_key order
with open("../data/processed/finecap/cand_clip_norm.txt", "w") as cand_file:
    cand_file.writelines(data[image_id]["caption"] + "\n" for image_id in test_key)

In [7]:
# ground truth caption

# Reload the Karpathy split so this cell does not rely on `karpathy_test`
# from an earlier cell (it still needs `test_key`).
with open("../data/raw/coco/dataset_coco_karpathy.json") as f:
    karpathy = json.load(f)
karpathy_test = [x for x in karpathy["images"] if x["split"] == "test"]

# Number of reference captions written per image. The scoring step splits
# gt.txt with a fixed stride, so every image must contribute exactly this
# many lines or all later captions end up paired with the wrong image.
num_gt_per_image = 5

# Validate everything BEFORE opening the output file. Printing "error" and
# breaking out of the write loop would leave a truncated gt.txt on disk, and
# zip() alone silently ignores a length mismatch.
if len(test_key) != len(karpathy_test):
    raise ValueError(
        "test_key has %d entries but the Karpathy test split has %d images"
        % (len(test_key), len(karpathy_test))
    )
for position, (key, item) in enumerate(zip(test_key, karpathy_test)):
    if key != str(item["cocoid"]):
        raise ValueError(
            "order mismatch at position %d: test_key=%r, cocoid=%r"
            % (position, key, item["cocoid"])
        )
    if len(item["sentences"]) < num_gt_per_image:
        raise ValueError(
            "image %r has only %d reference captions, expected at least %d"
            % (item["cocoid"], len(item["sentences"]), num_gt_per_image)
        )

# Write the first `num_gt_per_image` raw captions of each image, one per line
with open("../data/processed/finecap/gt.txt", "w") as f:
    for item in karpathy_test:
        for gt in item["sentences"][:num_gt_per_image]:
            f.write(gt["raw"].strip() + "\n")

In [8]:
# image path

# Check the alignment between test_key and the Karpathy test images BEFORE
# opening the output file. Printing "error" and breaking out of the write loop
# would leave a truncated imgs.txt on disk, and zip() alone silently ignores
# a length mismatch.
if len(test_key) != len(karpathy_test):
    raise ValueError(
        "test_key has %d entries but the Karpathy test split has %d images"
        % (len(test_key), len(karpathy_test))
    )
for position, (key, item) in enumerate(zip(test_key, karpathy_test)):
    if key != str(item["cocoid"]):
        raise ValueError(
            "order mismatch at position %d: test_key=%r, cocoid=%r"
            % (position, key, item["cocoid"])
        )

# One image path per line, in test_key order
with open("../data/processed/finecap/imgs.txt", "w") as f:
    for item in karpathy_test:
        f.write("../data/raw/coco/" + item["filepath"] + "/" + item["filename"] + "\n")

## 02-A compute score - feature extraction

In [12]:
# Input: text file with one image path per line
IMAGE_PATH = "../data/processed/finecap/imgs.txt"
# Output: pickle file that receives the extracted image features
IMAGE_FEATURE = "../data/processed/finecap/image_features.pkl"

# Settings passed to the feature extractor
DETECTRON_MODEL = "../data/detection/detectron_model.pth"
DETECTRON_CONFIG = "../data/detection/detectron_config.yaml"
FEATURE_EXTRACTOR_BATCH_SIZE = 4

In [13]:
import sys

# Make the helper modules under ../vilbert importable
sys.path.append("../vilbert")
from extract_features_custom import FeatureExtractor

# Build the extractor from the model/config paths and batch size configured above
feature_extractor = FeatureExtractor(
    model_file=DETECTRON_MODEL,
    config_file=DETECTRON_CONFIG,
    batch_size=FEATURE_EXTRACTOR_BATCH_SIZE,
)


Summary
----------------------------------------------------------------------
 - model_file                         : ../data/detection/detectron_model.pth
 - config_file                        : ../data/detection/detectron_config.yaml
 - batch_size                         : 4
 - num_features                       : 100
 - output_folder                      : 
 - image_dir                          : 
 - feature_name                       : fc6
 - confidence_threshold               : 0
 - background                         : False
 - partition                          : 0


In [14]:
# Read the image list: one path per line, surrounding whitespace removed
with open(IMAGE_PATH) as path_file:
    image_path = [line.strip() for line in path_file]
# image_path = image_path[2500:]
len(image_path)

2500

In [15]:
# Extract features for every listed image; one result per path is expected
img_features = feature_extractor.extract_features_direct(image_path)
assert len(img_features) == len(image_path)

625it [46:23,  4.57s/it]


In [16]:
import pickle

# Persist the extracted features to the configured output file
with open(IMAGE_FEATURE, "wb") as feature_file:
    pickle.dump(img_features, feature_file)

In [4]:
# First partial feature file
# NOTE(review): presumably written by an earlier run over part of the image list - confirm
with open("../data/processed/finecap/image_features_01.pkl", "rb") as part_file:
    data_01 = pickle.load(part_file)

In [5]:
# Second partial feature file
# NOTE(review): presumably written by an earlier run over part of the image list - confirm
with open("../data/processed/finecap/image_features_02.pkl", "rb") as part_file:
    data_02 = pickle.load(part_file)

In [16]:
# Concatenate the two partial feature lists, first part followed by second
data = [*data_01, *data_02]
len(data)

5000

In [25]:
# Save the merged features as the single file read by the scoring step
with open("../data/processed/finecap/image_features.pkl", "wb") as merged_file:
    pickle.dump(data, merged_file)

## 02-B compute score

In [10]:
# Inputs of the scoring step
IMAGE_FEATURE = "../data/processed/finecap/image_features.pkl"  # written by step 02-A
GENERATED_CAPTION = "../data/processed/finecap/cand_clip_norm.txt"  # candidate captions
GT_CAPTION = "../data/processed/finecap/gt.txt"  # reference captions

# Batch size handed to VilbertScore
VS_BATCH_SIZE = 400

In [11]:
import pickle
import sys

import pandas as pd

# Make the helper modules under ../vilbert importable
sys.path.append("../vilbert")
from compute_vilbertscore_custom import VilbertScore

# Scorer instance used by the cells below
vs = VilbertScore(batch_size=VS_BATCH_SIZE)

In [12]:
# Image features written by the feature-extraction section of this notebook.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open(IMAGE_FEATURE, "rb") as f:
    imgs = pickle.load(f)

# Candidate captions, one per line
with open(GENERATED_CAPTION, "r") as f:
    cand_caps = [x.strip() for x in f]

# Reference captions, one per line (several consecutive lines per image)
with open(GT_CAPTION, "r") as f:
    gt_caps = [x.strip() for x in f]

print(len(imgs), len(cand_caps), len(gt_caps))

# The scoring loop pairs features, candidates and references purely by
# position, so inconsistent sizes must fail here instead of silently
# producing misaligned scores.
if not cand_caps:
    raise ValueError("no candidate captions found in " + GENERATED_CAPTION)
if len(imgs) != len(cand_caps):
    raise ValueError(
        "%d image features but %d candidate captions" % (len(imgs), len(cand_caps))
    )
if len(gt_caps) % len(cand_caps) != 0:
    raise ValueError(
        "%d reference captions is not a multiple of %d candidate captions"
        % (len(gt_caps), len(cand_caps))
    )

# Number of reference captions per candidate (exact, thanks to the check above)
num_itr = len(gt_caps) // len(cand_caps)
print("num_itr: ", num_itr)

5000 5000 25000
num_itr:  5


In [13]:
# multi gt case
for i in range(num_itr):
    
    # subset of gt caption
    sub_gt_cap = []
    for index, cap in enumerate(gt_caps):
        if index % num_itr == i:
            sub_gt_cap.append(cap)    
    
    # load dataset to compute
    vs.loaddata(list_image_feature = imgs,
                list_gen_caption = cand_caps,
                list_gt_caption = sub_gt_cap,
                max_len=50
               )
    
    precision, recall, f1 = vs.compute()
    
    if i==0:
        df = pd.DataFrame(data=[precision, recall, f1]).T
    else:
        df_tmp = pd.DataFrame(data=[precision, recall, f1]).T
        df = pd.concat([df, df_tmp])
        
df = df.rename(columns={0:"precision", 1:"recall", 2:"f1"})

  0%|          | 0/13 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a person wearing a red jacket riding a bike on a dirt path with the mountain', 'a young woman and a child eating a plate of cake at the table']
Ground truth Captions (2 samples):  ['A man with a red helmet on a small moped on a dirt road.', 'A young girl inhales with the intent of blowing out a candle.']


100%|██████████| 13/13 [00:52<00:00,  3.55s/it]
  0%|          | 0/13 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a person wearing a red jacket riding a bike on a dirt path with the mountain', 'a young woman and a child eating a plate of cake at the table']
Ground truth Captions (2 samples):  ['Man riding a motor bike on a dirt road on the countryside.', 'A young girl is preparing to blow out her candle.']


100%|██████████| 13/13 [00:52<00:00,  3.60s/it]
  0%|          | 0/13 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a person wearing a red jacket riding a bike on a dirt path with the mountain', 'a young woman and a child eating a plate of cake at the table']
Ground truth Captions (2 samples):  ['A man riding on the back of a motorcycle.', 'A kid is to blow out the single candle in a bowl of birthday goodness.']


100%|██████████| 13/13 [00:53<00:00,  3.61s/it]
  0%|          | 0/13 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a person wearing a red jacket riding a bike on a dirt path with the mountain', 'a young woman and a child eating a plate of cake at the table']
Ground truth Captions (2 samples):  ['A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains.', 'Girl blowing out the candle on an ice-cream']


100%|██████████| 13/13 [00:53<00:00,  3.63s/it]
  0%|          | 0/13 [00:00<?, ?it/s]

target data
Generated Captions (2 samples):  ['a person wearing a red jacket riding a bike on a dirt path with the mountain', 'a young woman and a child eating a plate of cake at the table']
Ground truth Captions (2 samples):  ['A man in a red shirt and a red hat is on a motorcycle on a hill side.', 'A little girl is getting ready to blow out a candle on a small dessert.']


100%|██████████| 13/13 [00:53<00:00,  3.62s/it]


### analysis

In [14]:
import numpy as np

In [15]:
# Average each candidate's scores over all of its reference captions.
# `df` stacks one frame per reference pass without resetting the index, so a
# candidate's position appears as a repeated row label; grouping on the index
# collects exactly that candidate's rows.
# The mean divides by the actual number of rows per candidate instead of a
# hard-coded 5, and a single groupby replaces one label lookup per candidate
# on a non-unique index.
mean_scores = df.groupby(level=0)[["precision", "recall", "f1"]].mean()

# Keep candidate order and fail (KeyError) if any candidate has no scores
mean_scores = mean_scores.loc[list(range(len(cand_caps)))]

scores_precision = mean_scores["precision"].tolist()
scores_recall = mean_scores["recall"].tolist()
scores_f1 = mean_scores["f1"].tolist()

In [16]:
# One row per candidate caption with its averaged precision / recall / f1
df_rst = pd.DataFrame(
    {
        "precision": scores_precision,
        "recall": scores_recall,
        "f1": scores_f1,
    }
)

In [17]:
# Save the per-candidate averaged scores (without the index column)
df_rst.to_csv("../data/processed/finecap/vilbertscore_clip_norm.csv", index=False)

### check results

In [8]:
import pandas as pd

In [9]:
# Column means of the saved scores in vilbertscore_cider.csv
df_cider = pd.read_csv("../data/processed/finecap/vilbertscore_cider.csv")
df_cider.mean()

precision    0.904526
recall       0.886087
f1           0.894967
dtype: float64

In [10]:
# Column means of the saved scores in vilbertscore_clip.csv
df_clip = pd.read_csv("../data/processed/finecap/vilbertscore_clip.csv")
df_clip.mean()

precision    0.812533
recall       0.868545
f1           0.839295
dtype: float64

In [11]:
# Column means of the saved scores in vilbertscore_clip_norm.csv (written by the analysis cells of this notebook)
df_clip_norm = pd.read_csv("../data/processed/finecap/vilbertscore_clip_norm.csv")
df_clip_norm.mean()

precision    0.877505
recall       0.884602
f1           0.880798
dtype: float64