In [1]:
import os
import pandas as pd
import itertools
from tqdm import tqdm
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import cv2
from lavis.models import load_model_and_preprocess
import torch
from PIL import Image
import json
from infoseek_eval import evaluate as evaluate_infoseek
import argparse
from infoseek_data.data_path import INFOSEEK_SPLIT2DATA, ID2IMAGE, IMAGES, OVEN_SPLIT2DATA
from peft import LoraConfig, get_peft_model
import random

2024-04-23 12:50:43.025640: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# InfoSeek files under infoseek_data/, keyed by split name.
# Every path has the form infoseek_data/infoseek_<split>.jsonl.
split2data = {
    split: f"infoseek_data/infoseek_{split}.jsonl"
    for split in ("val", "val_withkb", "test", "human", "train", "train_withkb")
}

# Companion files under infoseek_qtype/, keyed by split name.
# Every path has the form infoseek_qtype/infoseek_<split>.jsonl.
qtype = {
    split: f"infoseek_qtype/infoseek_{split}.jsonl"
    for split in ("val", "val_mini", "test", "human", "train")
}

In [6]:
# Accumulators for the validation split; a later cell appends to them.
val_image_id, val_data_split = [], []

# One image id per record of the training JSONL file (duplicates are kept).
with open(split2data["train"], "r", encoding="utf-8") as train_file:
    train_image_id = [json.loads(record)["image_id"] for record in train_file]

len(train_image_id)

934048

In [4]:
# Re-create the accumulators in this cell so that running it again does not
# append a second copy of every validation record to lists left in the kernel.
val_image_id = []
val_data_split = []

# Read as UTF-8 explicitly, like the other loader cells, instead of relying on
# the platform's default encoding.
with open(split2data["val"], "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        val_image_id.append(record["image_id"])
        val_data_split.append(record["data_split"])

In [5]:
len(val_image_id)

73620

In [10]:
# Entity id of every record in the KB-annotated training file (duplicates kept).
with open(split2data["train_withkb"], "r", encoding="utf-8") as kb_file:
    train_entity_id = [json.loads(record)["entity_id"] for record in kb_file]

In [11]:
train_entity_id = list(set(train_entity_id))
len(train_entity_id)

5549

In [12]:
# Read the entity id of every record in the KB-annotated validation file,
# then keep each distinct id once.
with open(split2data["val_withkb"], "r", encoding="utf-8") as kb_file:
    val_entity_id = [json.loads(record)["entity_id"] for record in kb_file]

val_entity_id = list(set(val_entity_id))
len(val_entity_id)

1794

# Statistics Check

In [13]:
# convert jsonl to dataframe
def js2df(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Args:
        file_path: Path of a text file holding one JSON object per line.

    Returns:
        A DataFrame with one row per parsed line, in file order. The result
        is an empty DataFrame when the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly: the records contain non-ASCII text, and the
    # platform's default encoding is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on them.
        records = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame(records)

In [18]:
def merge(df1, df2, key="data_id"):
    """Inner-join the contents of two JSON Lines files on ``key``.

    Args:
        df1: Path of the first JSON Lines file (loaded with ``js2df``).
        df2: Path of the second JSON Lines file (loaded with ``js2df``).
        key: Column name(s) to join on; passed as ``on`` to the pandas merge.

    Returns:
        A DataFrame containing only the rows whose ``key`` value appears in
        both files.
    """
    # Despite the parameter names, the arguments are paths; load them first.
    left_frame = js2df(df1)
    right_frame = js2df(df2)
    return left_frame.merge(right_frame, on=key, how='inner')

In [19]:
train_df = merge(qtype['train'], split2data["train_withkb"])
val_df = merge(qtype['val'], split2data["val_withkb"])

In [20]:
train_df

Unnamed: 0,data_id,image_id,question,answer,answer_eval,data_split,question_type,entity_id,entity_text
0,infoseek_train_00000000,oven_01963180,Which place is this animal endemic to?,[People's Republic of China],"[cn, People's Republic of China, China, Mainla...",train,String,Q33602,Giant panda
1,infoseek_train_00000001,oven_03952028,What is the mohs' hardness of this material?,[7.6],"[{'wikidata': 7.6, 'range': [6.84, 8.36]}]",train,Numerical,Q43513,Emerald
2,infoseek_train_00000002,oven_01857671,What is the conservation status of this animal...,[Endangered],"[Endangered species, Endangered, EN]",train,String,Q192710,Malayan tapir
3,infoseek_train_00000003,oven_02959375,Who is the manufacturer of this vehicle?,[AM General],"[Am General, AM General LLC, AM General, AM Ge...",train,String,Q217046,Humvee
4,infoseek_train_00000004,oven_04416546,What fields are the person in the image specia...,[baking],[baking],train,String,Q160131,Baker
...,...,...,...,...,...,...,...,...,...
934043,infoseek_train_00934043,oven_00670496,How many days does the gestation of this anima...,[30.0-32.0],"[{'wikidata': 31.0, 'range': [30.0, 32.0]}]",train,Numerical,Q221612,Groundhog
934044,infoseek_train_00934044,oven_01683806,What is the oldest age of this animal?,[25],"[{'wikidata': 25.0, 'range': [22.5, 27.5000000...",train,Numerical,Q188657,Striped hyena
934045,infoseek_train_00934045,oven_00270092,What is the minimum number of players of a gam...,[2],"[{'wikidata': 2.0, 'range': [1.8, 2.2]}]",train,Numerical,Q7291,Badminton
934046,infoseek_train_00934046,oven_04651005,What is the source that produces this food?,[Quercus],"[oaks, oak, oak tree, the oak genus, Quercus]",train,String,Q3914781,Acorn


In [21]:
val_df

Unnamed: 0,data_id,image_id,question,answer,answer_eval,data_split,question_type,entity_id,entity_text
0,infoseek_val_00000000,oven_04990048,What is the objective of this object?,[energy transformation],"[energy conversion, energy transformation, con...",val_unseen_question,String,Q178185,Heat engine
1,infoseek_val_00000001,oven_04962816,What is the conservation status of this animal...,[Endangered],"[Endangered species, Endangered, EN]",val_unseen_question,String,Q173651,African wild dog
2,infoseek_val_00000002,oven_04967661,How many days does the gestation of this anima...,[6.5-7.0],"[{'wikidata': 6.75, 'range': [6.5, 7.0]}]",val_unseen_question,Numerical,Q132576,Impala
3,infoseek_val_00000003,oven_05001968,What is the source that produces this plant?,[Vitis labrusca],"[Vitis labrusca, Fox grape]",val_unseen_question,String,Q393047,Concord grape
4,infoseek_val_00000004,oven_04951981,What country does this place belong to?,[Pakistan],"[Islamic Republic of Pakistan, 🇵🇰, pk, PAK, Pa...",val_unseen_question,String,Q996962,Minar-e-Pakistan
...,...,...,...,...,...,...,...,...,...
73615,infoseek_val_00073615,oven_05023559,Which city or region does this building locate...,"[Kyffhäuserland, Bad Frankenhausen]","[Bad Frankenhausen/Kyffhäuser, Bad Frankenhaus...",val_unseen_entity,String,Q573639,Kyffhäuser Monument
73616,infoseek_val_00073616,oven_00004044,In which year did this aircraft come into serv...,[1968],"[1967, 1968, 1969]",val_unseen_entity,Time,Q683041,De Havilland Canada DHC-6 Twin Otter
73617,infoseek_val_00073617,oven_04949031,What type of cuisine is this food?,[Italian cuisine],"[cuisine of Italy, Italian cuisine, Italian co...",val_unseen_entity,String,Q20026,Spaghetti
73618,infoseek_val_00073618,oven_05026314,Which historic county does this facility belon...,[County Dublin],"[co Dublin, Dublin, co. Dublin, County Dublin,...",val_unseen_entity,String,Q917854,Mount Jerome Cemetery and Crematorium


In [22]:
# entity
train_entity = set(train_df['entity_id'])
val_entity = set(val_df['entity_id'])
len(train_entity), len(val_entity)

(5549, 1794)

In [23]:
train_df['entity_id'].value_counts()

entity_id
Q202878     9442
Q2736       5125
Q80811      5051
Q4504       4689
Q7075       4230
            ... 
Q161473        2
Q2187594       2
Q2978738       2
Q2033039       2
Q287007        2
Name: count, Length: 5549, dtype: int64

In [24]:
val_df['entity_id'].value_counts()

entity_id
Q257298     580
Q390370     538
Q217109     515
Q517545     475
Q204871     453
           ... 
Q3487324      2
Q614880       2
Q212047       2
Q6130468      2
Q1940276      2
Name: count, Length: 1794, dtype: int64

In [26]:
train_df['question_type'].value_counts() / len(train_df)

question_type
String       0.751718
Numerical    0.204468
Time         0.043814
Name: count, dtype: float64

In [27]:
val_df['question_type'].value_counts() / len(val_df)

question_type
String       0.737829
Numerical    0.215920
Time         0.046251
Name: count, dtype: float64

In [28]:
val_df['data_split'].value_counts()

data_split
val_unseen_entity      54964
val_unseen_question    18656
Name: count, dtype: int64

In [29]:
# Partition the validation frame by the value of its data_split column.
val_df_unseen_entity = val_df.loc[val_df['data_split'].eq("val_unseen_entity")]
val_df_unseen_question = val_df.loc[val_df['data_split'].eq("val_unseen_question")]

In [30]:
len(val_df_unseen_entity), len(val_df_unseen_question)

(54964, 18656)

In [34]:
# unseen entity
print(len(set(train_df['entity_id'])), len(set(val_df_unseen_entity['entity_id'])))
len(set(train_df['entity_id']).intersection(set(val_df_unseen_entity['entity_id'])))

5549 1192


0

In [48]:
# unseen entity: overlap of question texts with train
print(len(set(train_df['question'])), len(set(val_df_unseen_entity['question'])))
len(set(train_df['question']).intersection(set(val_df_unseen_entity['question'])))

881 486


376

In [35]:
# unseen question
print(len(set(train_df['entity_id'])), len(set(val_df_unseen_question['entity_id'])))
len(set(train_df['entity_id']).intersection(set(val_df_unseen_question['entity_id'])))

5549 602


602

In [49]:
print(len(set(train_df['question'])), len(set(val_df_unseen_question['question'])))
len(set(train_df['question']).intersection(set(val_df_unseen_question['question'])))

881 360


304

In [54]:
val_unseen_question_seen_entity = list(set(val_df_unseen_question['entity_id']))
sub_df = train_df[train_df['entity_id'].isin(val_unseen_question_seen_entity)]

In [56]:
len(set(sub_df['entity_id']))

602

In [55]:
print(len(set(sub_df['question'])), len(set(val_df_unseen_question['question'])))
len(set(sub_df['question']).intersection(set(val_df_unseen_question['question'])))

356 360


201

In [64]:
sub_df[['entity_id', 'question']], val_df_unseen_question[['entity_id', 'question']]

(       entity_id                                           question
 47       Q858459  how many year do these object in the image typ...
 59          Q525  What is the surface gravity of the place in me...
 68       Q542500           What country does this animal belong to?
 82        Q34706             where do you usually find this animal?
 83        Q14381                     Where is the lake inflow from?
 ...          ...                                                ...
 934022    Q35694            What is the litter size of this animal?
 934030    Q47542         What kind of effect does this animal have?
 934040    Q82738  How many days does the gestation of this anima...
 934043   Q221612  How many days does the gestation of this anima...
 934047    Q25332  What is the conservation status of this bird? ...
 
 [100589 rows x 2 columns],
       entity_id                                           question
 0       Q178185              What is the objective of this object?
 1    

In [71]:
pd.merge(sub_df[['entity_id', 'question']].drop_duplicates(), val_df_unseen_question[['entity_id', 'question']].drop_duplicates(), on=['entity_id', 'question'], how="inner")

Unnamed: 0,entity_id,question
0,Q465811,Who designed this park?


In [73]:
train_df[(train_df['entity_id'] == "Q465811") & (train_df['question'] == "Who designed this park?")]

Unnamed: 0,data_id,image_id,question,answer,answer_eval,data_split,question_type,entity_id,entity_text
11647,infoseek_train_00011647,oven_00031864,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
40971,infoseek_train_00040971,oven_00031896,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
99062,infoseek_train_00099062,oven_00031874,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
107087,infoseek_train_00107087,oven_00031863,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
127451,infoseek_train_00127451,oven_00031893,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
143533,infoseek_train_00143533,oven_00031867,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
180307,infoseek_train_00180307,oven_00031871,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
191163,infoseek_train_00191163,oven_00031875,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
221556,infoseek_train_00221556,oven_00031881,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall
234138,infoseek_train_00234138,oven_00031862,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",train,String,Q465811,National Mall


In [70]:
val_df_unseen_question[(val_df_unseen_question['entity_id'] == "Q465811") & (val_df_unseen_question['question'] == "Who designed this park?")]

Unnamed: 0,data_id,image_id,question,answer,answer_eval,data_split,question_type,entity_id,entity_text
1810,infoseek_val_00001810,oven_04952652,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
3256,infoseek_val_00003256,oven_04952655,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
3834,infoseek_val_00003834,oven_04952665,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
4805,infoseek_val_00004805,oven_04952654,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
5113,infoseek_val_00005113,oven_04952663,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
6093,infoseek_val_00006093,oven_04952656,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
6430,infoseek_val_00006430,oven_04952660,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
6927,infoseek_val_00006927,oven_04952661,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
8853,infoseek_val_00008853,oven_04952658,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall
9048,infoseek_val_00009048,oven_04952650,Who designed this park?,[Pierre Charles L'Enfant],"[Pierre C. L'Enfant, Pierre L'Enfant, Pierre-C...",val_unseen_question,String,Q465811,National Mall


In [14]:
js2df(split2data["val_withkb"]).head()

Unnamed: 0,data_id,entity_id,entity_text
0,infoseek_val_00042030,Q934551,Derwentwater
1,infoseek_val_00030546,Q1121020,Compact Disc Digital Audio
2,infoseek_val_00033267,Q155869,Whinchat
3,infoseek_val_00033700,Q20026,Spaghetti
4,infoseek_val_00005702,Q152073,Monserrate


In [85]:
pd.DataFrame(train_df[['image_id', 'question', 'entity_id']].value_counts()).to_csv("count.csv")

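Repeated samples: the same (image_id, question) pair can occur more than once in the training set; in the example below the two rows carry different answers.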
Question: Who designed this bridge?
Entity ID: Q125821

In [80]:
train_df[(train_df['image_id'] == "oven_00056329") & (train_df['question'] == "Who designed this bridge?")]

Unnamed: 0,data_id,image_id,question,answer,answer_eval,data_split,question_type,entity_id,entity_text
9152,infoseek_train_00009152,oven_00056329,Who designed this bridge?,[Cass Gilbert],[Cass Gilbert],train,String,Q125821,George Washington Bridge
55914,infoseek_train_00055914,oven_00056329,Who designed this bridge?,[Othmar Ammann],"[Othmar H. Ammann, Othmar Ammann, Othmar Herma...",train,String,Q125821,George Washington Bridge


# OVEN

In [36]:
# OVEN files under oven_data/, keyed as "<split>_<kind>".
# Note the file name swaps the order: oven_<kind>_<split>.jsonl.
oven = {
    f"{split}_{kind}": f"oven_data/oven_{kind}_{split}.jsonl"
    for split in ("val", "train")
    for kind in ("entity", "query")
}

In [38]:
train_df_entity = js2df(oven["train_entity"])

In [40]:
val_df_entity = js2df(oven["val_entity"])
train_df_query = js2df(oven["train_query"])
val_df_query = js2df(oven["val_query"])

In [86]:
train_df_entity

Unnamed: 0,data_id,image_id,question,entity_id,entity_text,data_split
0,oven_entity_train_00000000,oven_00000000,what is the model of this aircraft?,Q1141409,Dassault Falcon 900,entity_train
1,oven_entity_train_00000001,oven_00000001,what is the model of this aircraft?,Q1141409,Dassault Falcon 900,entity_train
2,oven_entity_train_00000002,oven_00000002,what is the model of this aircraft?,Q1141409,Dassault Falcon 900,entity_train
3,oven_entity_train_00000003,oven_00000003,what is the model of this aircraft?,Q1141409,Dassault Falcon 900,entity_train
4,oven_entity_train_00000004,oven_00000004,what is the model of this aircraft?,Q1141409,Dassault Falcon 900,entity_train
...,...,...,...,...,...,...
4926309,oven_entity_train_04926309,oven_04925734,what is the main content of this image?,Q287,Wood,entity_train
4926310,oven_entity_train_04926310,oven_04925735,which type of material is depicted in the image?,Q287,Wood,entity_train
4926311,oven_entity_train_04926311,oven_04925736,what is shown in the photo?,Q287,Wood,entity_train
4926312,oven_entity_train_04926312,oven_04925737,what kind of material is this?,Q287,Wood,entity_train


In [90]:
train_df_query

Unnamed: 0,data_id,image_id,question,entity_id,entity_text,data_split
0,oven_query_train_00000000,oven_05420926,who should we vote for?,Q1623348,Hogan,query_train
1,oven_query_train_00000001,oven_05420927,what character is pictured on the cake?,Q188760,Hulk,query_train
2,oven_query_train_00000002,oven_05420928,what character is pictured on the tablecloth?,Q188760,Hulk,query_train
3,oven_query_train_00000003,oven_05420929,what is on little boys shirt?,Q188760,Hulk,query_train
4,oven_query_train_00000004,oven_05420930,what is on the horses legs?,Q1761743,Red tape,query_train
...,...,...,...,...,...,...
32250,oven_query_train_00032250,oven_05445155,what animal is shown?,Q5113,Bird,query_train
32251,oven_query_train_00032251,oven_05436952,what animal is in this picture?,Q5113,Bird,query_train
32252,oven_query_train_00032252,oven_05445156,what is flying?,Q5113,Bird,query_train
32253,oven_query_train_00032253,oven_05445157,what type of animal is pictured?,Q5113,Bird,query_train


In [87]:
val_df_entity

Unnamed: 0,data_id,image_id,question,entity_id,entity_text,data_split
0,oven_entity_val_00000000,oven_04944518,what is this park called?,Q517545,Nationals Park,entity_val_seen
1,oven_entity_val_00000001,oven_04944519,what is the name of this park?,Q517545,Nationals Park,entity_val_seen
2,oven_entity_val_00000002,oven_04944520,what is the name of this place?,Q517545,Nationals Park,entity_val_seen
3,oven_entity_val_00000003,oven_04944521,where is this park?,Q517545,Nationals Park,entity_val_seen
4,oven_entity_val_00000004,oven_04944522,where is this place?,Q517545,Nationals Park,entity_val_seen
...,...,...,...,...,...,...
126194,oven_entity_val_00126194,oven_05070493,what material is presented in the image?,Q134658,Niacin,entity_val_unseen
126195,oven_entity_val_00126195,oven_05070494,which type of material is depicted in the image?,Q134658,Niacin,entity_val_unseen
126196,oven_entity_val_00126196,oven_05070495,what is the category of this material?,Q134658,Niacin,entity_val_unseen
126197,oven_entity_val_00126197,oven_05070496,what material is presented in the image?,Q134658,Niacin,entity_val_unseen


In [88]:
val_df_query

Unnamed: 0,data_id,image_id,question,entity_id,entity_text,data_split
0,oven_query_val_00000000,oven_04925739,what type of bird are these?,Q755737,Killdeer,query_val_seen
1,oven_query_val_00000001,oven_04925740,what is the person using his foot for?,Q268534,Flush toilet,query_val_seen
2,oven_query_val_00000002,oven_04925741,what is the green vegetable on the plate?,Q37153,Avocado,query_val_seen
3,oven_query_val_00000003,oven_04925742,what is cut up in the bowl?,Q37153,Avocado,query_val_seen
4,oven_query_val_00000004,oven_04925743,what vegetable is on top of the rack?,Q37153,Avocado,query_val_seen
...,...,...,...,...,...,...
3286,oven_query_val_00003286,oven_04937912,what is the man holding?,Q457689,Surfboard,query_val_unseen
3287,oven_query_val_00003287,oven_04937913,what is the person holding?,Q457689,Surfboard,query_val_unseen
3288,oven_query_val_00003288,oven_04937914,what is the man riding?,Q457689,Surfboard,query_val_unseen
3289,oven_query_val_00003289,oven_04937915,what is the person on?,Q457689,Surfboard,query_val_unseen


In [43]:
len(train_df_entity), len(train_df_query)

(4926314, 32255)

In [47]:
len(set(list(val_df_entity['entity_id']) + list(val_df_query['entity_id'])))

3946

In [96]:
# unique query
len(set(list(val_df_entity['question']) + list(val_df_query['question'])))

3124

In [89]:
# Split the OVEN entity validation frame by its data_split label.
val_entity_unseen = val_df_entity.loc[val_df_entity['data_split'].eq("entity_val_unseen")]
val_entity_seen = val_df_entity.loc[val_df_entity['data_split'].eq("entity_val_seen")]

# Split the OVEN query validation frame by its data_split label.
val_query_unseen = val_df_query.loc[val_df_query['data_split'].eq("query_val_unseen")]
val_query_seen = val_df_query.loc[val_df_query['data_split'].eq("query_val_seen")]

In [91]:
len(val_entity_unseen), len(val_entity_seen), len(val_query_unseen), len(val_query_seen)

(61098, 65101, 2268, 1023)

In [95]:
# seen entities: val_entity_seen + val_query_seen
len(set(list(val_entity_seen['entity_id']) + list(val_query_seen['entity_id'])))

1942

In [99]:
# unseen entities
len(set(list(val_entity_unseen['entity_id']) + list(val_query_unseen['entity_id'])))

2004

In [100]:
# seen
len(val_entity_seen) + len(val_query_seen)

66124

In [101]:
# unseen
len(val_entity_unseen) + len(val_query_unseen)

63366

In [103]:
# Stack the entity and query training frames into one frame. Each input frame
# has its own 0-based row labels, so renumber the rows; otherwise the low
# labels would appear twice in the combined index.
train_oven = pd.concat([train_df_entity, train_df_query], axis=0, ignore_index=True)

In [104]:
# seen entity
print(len(set(train_oven['entity_id'])), len(set(val_entity_seen['entity_id'])))
len(set(train_oven['entity_id']).intersection(set(val_entity_seen['entity_id'])))

9788 1721


1712

In [121]:
# test if seen_entity and seen_query are wrt questions as well
print(len(train_oven[['entity_id', 'question']].drop_duplicates()), len(val_entity_seen[['entity_id', 'question']].drop_duplicates()))
len(pd.merge(train_oven[['entity_id', 'question']].drop_duplicates(), val_entity_seen[['entity_id', 'question']].drop_duplicates(), on=['entity_id', 'question'], how="inner"))

72898 10972


9933

In [120]:
print(len(train_oven[['entity_id', 'question']].drop_duplicates()), len(val_query_seen[['entity_id', 'question']].drop_duplicates()))
len(pd.merge(train_oven[['entity_id', 'question']].drop_duplicates(), val_query_seen[['entity_id', 'question']].drop_duplicates(), on=['entity_id', 'question'], how="inner"))

72898 968


165

In [105]:
# unseen entity
print(len(set(train_oven['entity_id'])), len(set(val_entity_unseen['entity_id'])))
len(set(train_oven['entity_id']).intersection(set(val_entity_unseen['entity_id'])))

9788 1471


0

In [106]:
# seen query
print(len(set(train_oven['entity_id'])), len(set(val_query_seen['entity_id'])))
len(set(train_oven['entity_id']).intersection(set(val_query_seen['entity_id'])))

9788 265


264

In [107]:
# unseen query
print(len(set(train_oven['entity_id'])), len(set(val_query_unseen['entity_id'])))
len(set(train_oven['entity_id']).intersection(set(val_query_unseen['entity_id'])))

9788 546


0

## For OVEN, the seen/unseen splits are defined only with respect to entities, not questions

In [108]:
len(set(val_entity_seen['entity_id']).intersection(set(val_query_seen['entity_id'])))

44

In [113]:
same = list(set(val_entity_seen['entity_id']).intersection(set(val_query_seen['entity_id'])))
same

['Q177',
 'Q18398431',
 'Q16472452',
 'Q3487748',
 'Q1548030',
 'Q7639866',
 'Q11946202',
 'Q272502',
 'Q32489',
 'Q12337',
 'Q177941',
 'Q11111030',
 'Q165044',
 'Q34679',
 'Q31087',
 'Q446',
 'Q34706',
 'Q13317',
 'Q18545',
 'Q388450',
 'Q43663',
 'Q166080',
 'Q12493',
 'Q38965',
 'Q13188',
 'Q275447',
 'Q915026',
 'Q13450283',
 'Q8355',
 'Q1072900',
 'Q163446',
 'Q134768',
 'Q215857',
 'Q178024',
 'Q190868',
 'Q1192284',
 'Q5840',
 'Q13187',
 'Q160525',
 'Q208253',
 'Q605384',
 'Q168658',
 'Q5113',
 'Q7743']

In [114]:
val_entity_seen[val_entity_seen['entity_id'] == "Q177"]

Unnamed: 0,data_id,image_id,question,entity_id,entity_text,data_split
14834,oven_entity_val_00014834,oven_04959278,what kind of food is this?,Q177,Pizza,entity_val_seen
14835,oven_entity_val_00014835,oven_04959279,which category of food is shown in the image?,Q177,Pizza,entity_val_seen
14836,oven_entity_val_00014836,oven_04959280,which type of food is depicted in the image?,Q177,Pizza,entity_val_seen
14837,oven_entity_val_00014837,oven_04959281,what kind of food is this?,Q177,Pizza,entity_val_seen
14838,oven_entity_val_00014838,oven_04959282,which category of food is shown in the image?,Q177,Pizza,entity_val_seen
14839,oven_entity_val_00014839,oven_04959283,which type of food is depicted in the image?,Q177,Pizza,entity_val_seen
14840,oven_entity_val_00014840,oven_04959284,what food is presented in the image?,Q177,Pizza,entity_val_seen
14841,oven_entity_val_00014841,oven_04959285,what is this food?,Q177,Pizza,entity_val_seen
14842,oven_entity_val_00014842,oven_04959286,which category of food is shown in the image?,Q177,Pizza,entity_val_seen
14843,oven_entity_val_00014843,oven_04959287,which type of food is depicted in the image?,Q177,Pizza,entity_val_seen


In [115]:
val_query_seen[val_query_seen['entity_id'] == "Q177"]

Unnamed: 0,data_id,image_id,question,entity_id,entity_text,data_split
95,oven_query_val_00000095,oven_04925834,what type of food does ledo's sell?,Q177,Pizza,query_val_seen
96,oven_query_val_00000096,oven_04925835,what is hot-n-ready?,Q177,Pizza,query_val_seen
97,oven_query_val_00000097,oven_04925836,what company could be sponsoring this game?,Q177,Pizza,query_val_seen
98,oven_query_val_00000098,oven_04925837,what restaurant is in this theme park?,Q177,Pizza,query_val_seen
99,oven_query_val_00000099,oven_04925838,what can i put here?,Q177,Pizza,query_val_seen


In [41]:
tmp1 = set(train_df_entity['entity_id'])
tmp2 = set(val_df_entity['entity_id'])
print(len(tmp1), len(tmp2))
print(len(tmp1.intersection(tmp2)))

7943 3192
1604


In [42]:
tmp1 = set(train_df_entity['question'])
tmp2 = set(val_df_entity['question'])
print(len(tmp1), len(tmp2))
print(len(tmp1.intersection(tmp2)))

595 422
358


In [None]:
# train_image_id_set = set(train_image_id)
# found = False  # Flag to track if any match is found

# val_image_id_new = []
# val_image_id_ = []
# for idx, split in enumerate(val_data_split):
#     if split == "val_unseen_question":
#         val_image_id_new.append(val_image_id[idx])
#     else:
#         val_image_id_.append(val_image_id[idx])

# train_image_id_set = set(train_image_id)
# val_image_id_set = set(val_image_id_new)
# val_image_id_set_ = set(val_image_id_)
# print("Number of train_image_id:", len(train_image_id_set))
# print("Number of val_image_id:", len(val_image_id_set))
# print("Number of val_image_id_:", len(val_image_id_set_))

# # Compute the set difference: elements that appear only in val_image_id_set
# unseen_in_train = val_image_id_set.difference(train_image_id_set)

# # Print the set-difference result
# # print("Elements in val_image_id not in train_image_id:", unseen_in_train)

# # Check whether every val_image_id is also present in train_image_id
# if not unseen_in_train:
#     print("All val_image_id elements are present in train_image_id.")
# else:
#     print(f"Not all val_image_id elements are present in train_image_id.  Not in # {len(unseen_in_train)}")

def select_random_subset(input_file, output_file, percentage=0.1, seed=None):
    """Write a random sample of the lines of ``input_file`` to ``output_file``.

    Args:
        input_file: Path of the text file to sample from, one record per line.
        output_file: Path of the file to create (or overwrite) with the
            sampled lines.
        percentage: Fraction of the lines to keep, from 0 to 1 inclusive.
            The number of sampled lines is ``int(len(lines) * percentage)``,
            i.e. it is truncated towards zero.
        seed: Optional seed for a private random generator, so the same
            subset can be drawn again. With ``None`` (the default) the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``percentage`` is outside the range [0, 1].
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # A file whose last line has no trailing newline would otherwise have that
    # line glued to whichever sampled line is written after it.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"

    # Calculate the number of lines to sample
    sample_size = int(len(lines) * percentage)

    # Randomly sample lines without replacement
    rng = random if seed is None else random.Random(seed)
    sampled_lines = rng.sample(lines, sample_size)

    # Write the selected lines to a new jsonl file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(sampled_lines)

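# Usage (note: `subset_output_path`, a dict mapping each split to its output
# file, is not defined in the cells shown here and must be created first)
# for split in ["train", "val", "test", "human"]:
#     select_random_subset(split2data[split], subset_output_path[split], 0.1)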
