In [2]:
import json
import os
import random
from collections import Counter

import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [13]:
# Load the VQA v2 training questions and annotations from the Kaggle input mount.

DATA_DIR = "/kaggle/input/vqav2-train"

IMG_DIR = f"{DATA_DIR}/train2014/train2014"
Q_PATH = f"{DATA_DIR}/v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json"
A_PATH = f"{DATA_DIR}/v2_Annotations_Train_mscoco/v2_mscoco_train2014_annotations.json"

# JSON text is UTF-8 by specification; pass the encoding explicitly so decoding
# does not depend on the locale default that open() would otherwise use.
with open(Q_PATH, "r", encoding="utf-8") as f:
    questions = json.load(f)["questions"]

with open(A_PATH, "r", encoding="utf-8") as f:
    annotations = json.load(f)["annotations"]

print("Questions:", len(questions))
print("Annotations:", len(annotations))

Questions: 443757
Annotations: 443757


In [14]:
# Draw a fixed-size random subset of question/annotation pairs.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42

# A dedicated, seeded RNG makes the subset reproducible on Restart & Run All
# without disturbing the global `random` state.
sample_rng = random.Random(SAMPLE_SEED)

# Clamp the request: random.sample raises ValueError when asked for more
# items than the population holds.
indices = sample_rng.sample(range(len(questions)), min(SAMPLE_SIZE, len(questions)))

# The same indices are applied to both lists so each question keeps the
# annotation stored at the same position.
questions = [questions[i] for i in indices]
annotations = [annotations[i] for i in indices]


In [15]:
#Cleaning & Normalization

def normalize_text(text):
    """Return ``text`` lower-cased with leading/trailing whitespace removed."""
    lowered = text.lower()
    return lowered.strip()

# Questions with fewer words than this are dropped.
MIN_QUESTION_WORDS = 3

# Pair each question with its annotation through the shared question_id
# instead of relying on the two lists being in the same order.
annotations_by_qid = {ann["question_id"]: ann for ann in annotations}

clean_data = []
unmatched_questions = 0

for q in questions:
    a = annotations_by_qid.get(q["question_id"])
    if a is None:
        unmatched_questions += 1
        continue

    question = normalize_text(q["question"])

    # Filter first so answers are only normalized for questions that are kept.
    if len(question.split()) < MIN_QUESTION_WORDS:
        continue

    answers = [normalize_text(ans["answer"]) for ans in a["answers"]]
    if not answers:
        # Nothing to vote on; there is no label to attach to this question.
        continue

    # Counter.most_common breaks ties by first occurrence, so the chosen label
    # is deterministic (iterating a set of strings is not, across runs).
    most_common_answer = Counter(answers).most_common(1)[0][0]

    clean_data.append({
        "image_id": q["image_id"],
        "question": question,
        "answer": most_common_answer
    })

if unmatched_questions:
    print("Questions without a matching annotation:", unmatched_questions)

print("Clean samples:", len(clean_data))


Clean samples: 5000


In [17]:
#Image Preprocessing (Resize + RGB)

PROCESSED_IMG_DIR = "/kaggle/working/processed_images"
os.makedirs(PROCESSED_IMG_DIR, exist_ok=True)

def preprocess_image(image_id, size=(224, 224)):
    """Convert one training image to RGB, resize it, and save the result.

    Args:
        image_id: Numeric image id; it is zero-padded to 12 digits to build
            the ``COCO_train2014_<id>.jpg`` file name looked up in ``IMG_DIR``.
        size: ``(width, height)`` of the saved image. Defaults to ``(224, 224)``.

    Returns:
        Path of the processed copy written under ``PROCESSED_IMG_DIR``.

    Raises:
        OSError: If the source image is missing or cannot be read
            (``FileNotFoundError`` is a subclass of ``OSError``).
    """
    img_name = f"COCO_train2014_{str(image_id).zfill(12)}.jpg"
    img_path = os.path.join(IMG_DIR, img_name)

    # Image.open is lazy and keeps the file handle open; the context manager
    # closes it once convert() has loaded the pixels into a new image.
    with Image.open(img_path) as source:
        img = source.convert("RGB")

    img = img.resize(size)

    save_path = os.path.join(PROCESSED_IMG_DIR, img_name)
    img.save(save_path)

    return save_path


In [18]:
# Preprocess the image behind every cleaned sample and collect the rows that succeed.
final_samples = []
failed_image_ids = []

for item in tqdm(clean_data):
    try:
        img_path = preprocess_image(item["image_id"])
    except OSError:
        # Missing or unreadable image file: skip the sample but keep a record.
        # Catching only OSError lets Ctrl-C and genuine programming errors
        # surface instead of silently emptying the dataset.
        failed_image_ids.append(item["image_id"])
        continue

    final_samples.append({
        "image_path": img_path,
        "question": item["question"],
        "answer": item["answer"]
    })

if failed_image_ids:
    print(f"Skipped {len(failed_image_ids)} sample(s) whose image could not be processed")

df = pd.DataFrame(final_samples)
print(df.head())


100%|██████████| 5000/5000 [01:21<00:00, 61.51it/s]

                                          image_path  \
0  /kaggle/working/processed_images/COCO_train201...   
1  /kaggle/working/processed_images/COCO_train201...   
2  /kaggle/working/processed_images/COCO_train201...   
3  /kaggle/working/processed_images/COCO_train201...   
4  /kaggle/working/processed_images/COCO_train201...   

                                  question        answer  
0  who is playing a pickup game of soccer?           men  
1   what color is the little girl's dress?           red  
2                  is this a modern plane?            no  
3                is this boat near a port?            no  
4  what is the title on one of these book?  encyclopedia  





In [19]:
# Two-stage split: hold out 20% of the rows, then halve that hold-out into
# validation and test. Both stages use the same fixed random_state.
# NOTE(review): rows are split per question, so questions that share an image
# may land in different splits — confirm that is acceptable.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

for split_label, split_frame in (
    ("Train:", train_df),
    ("Validation:", val_df),
    ("Test:", test_df),
):
    print(split_label, len(split_frame))


Train: 4000
Validation: 500
Test: 500


In [20]:
# Write each split to its own CSV file, without the DataFrame index column.
split_outputs = {
    "/kaggle/working/train.csv": train_df,
    "/kaggle/working/val.csv": val_df,
    "/kaggle/working/test.csv": test_df,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)