# `test_split.ipynb`

This notebook creates `test_images.csv`, a CSV file listing the images in the test set: one row per image, holding the image file name (`path`) and its `caption`.

In [1]:
# config -- everything a reader might want to tune lives in this cell

# presumably: keep only photos that have a caption -- confirm where this is consumed
CAPTIONED_ONLY = True

# CSV file the test split (path, caption) is written to
OUT_FILE = "test_images.csv"

# random_state handed to train_test_split so the split is reproducible
SEED = 12734

# fraction of the matching photos held out as the test set
TEST_SIZE = 0.1

# directory that contains photos.json
root_dir = "/scratch/jl12734/yars"

In [2]:
import json
import os
import pandas as pd

from sklearn.model_selection import train_test_split

In [3]:
def gather_pairs(label: str, no_captions: bool = False, photos_path=None,
                 exclude_ids=("LXT4hCf1lRyUeM4HDBaSvg",)):
    """
    Collect (image file name, caption) pairs for one label from photos.json.

    The file is read as JSON Lines (one JSON object per line). Each object is
    expected to carry the keys 'photo_id', 'label' and 'caption'; a record
    with the requested label that lacks 'photo_id' or 'caption' raises KeyError.
    Blank lines are skipped; lines that are not valid JSON are reported with
    print() and skipped.

    label:       category of images to keep (compared against record['label'])
    no_captions: False (default) keeps only photos whose caption is non-empty;
                 True also keeps photos with an empty caption
    photos_path: path of the JSON Lines file; defaults to
                 os.path.join(root_dir, 'photos.json')
    exclude_ids: photo_ids that are always dropped. The default is the single
                 id this notebook has always excluded (reason not recorded
                 here -- TODO document why)

    returns: (all_photos, all_captions) -- two parallel lists of equal length;
             all_photos[i] is "<photo_id>.jpg", all_captions[i] its caption
    """
    if photos_path is None:
        # root_dir is only consulted when no explicit path is given
        photos_path = os.path.join(root_dir, 'photos.json')
    excluded = set(exclude_ids)

    all_photos = []
    all_captions = []
    # explicit encoding: do not depend on the platform default for caption text
    with open(photos_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip()
            if not line:
                # e.g. a trailing newline at end of file -- not a decode error
                continue
            try:
                photo_record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e} at line: {line}")
                continue

            if photo_record['label'] != label:
                continue
            if photo_record["photo_id"] in excluded:
                continue
            # only captioned photos by default; no_captions=True lifts the filter
            if no_captions or len(photo_record['caption']) > 0:
                all_photos.append(photo_record["photo_id"] + ".jpg")
                all_captions.append(photo_record['caption'])
    return all_photos, all_captions

In [4]:
# Gather the "food" photos and their captions. CAPTIONED_ONLY from the config
# cell decides whether photos with an empty caption are filtered out.
photos, captions = gather_pairs("food", no_captions=not CAPTIONED_ONLY)

# Hold out TEST_SIZE of the pairs as the test set; the train halves are
# discarded. random_state=SEED keeps the split reproducible across runs.
_, test_photos, _, test_captions = train_test_split(
    photos, captions, test_size=TEST_SIZE, random_state=SEED
)

In [5]:
# One row per held-out image: its file name and its caption.
test_split_df = pd.DataFrame({"path": test_photos, "caption": test_captions})

# index=False: the row number carries no information, keep it out of the CSV.
test_split_df.to_csv(OUT_FILE, index=False)

## Testing Area

In [7]:
# Read the CSV back and map each image path to its caption. Every row of the
# two-column frame is a (path, caption) pair, which dict() consumes directly.
test_set = dict(pd.read_csv(OUT_FILE, lineterminator='\n').values)

In [10]:
# Iterating a dict yields its keys, i.e. the image paths in the test set.
list(test_set)

['LPSy3oH5POTrMIa1rR2BYg.jpg',
 'cmeLobDhD4UekH6Awef8Kg.jpg',
 'fI2mgfnfkpX2xvEhpTAjqw.jpg',
 'S71crORK2JlvAVbddmtRRg.jpg',
 'wSnPYhIB75WmxFQWpLgA0A.jpg',
 'HAY7GOI8tFMyvEPSyJvdZQ.jpg',
 'CMnKKiksT7mi-m-WZ_WwXg.jpg',
 'lPOVqRF7LerQHWSXk-y-Xg.jpg',
 '4IY1_slChOn7mYUTLxrYtQ.jpg',
 'vO6XM_Z9gIzlnplzUy4wvQ.jpg',
 'Ioj93rahL9j4z2CfBwWyUg.jpg',
 'R5x1vAJ-JrS-tTPjedGInw.jpg',
 'B-Em20vfapzgsIyfbX24TA.jpg',
 '2sXpvYFG1dET4l7q8leFJg.jpg',
 'MXzAeECuB0wIZbdkZhM50g.jpg',
 'bUfa9sok91q3JjG3wY8xAA.jpg',
 'z3SAioOfwFiAcmplneY4uQ.jpg',
 'AwqR-Cwwj5IsQ-3DAxfGSw.jpg',
 'OpisM3PD87hnLU9UbT5wmQ.jpg',
 'IV7tw78pkBA7i8n1n4hySA.jpg',
 'vJ2vsjykO2GVTm_2jQfZAg.jpg',
 'VbOZrt4fSTNp9CXyHxi8yQ.jpg',
 'OQfU2nm_JOJfdsr_5aZd2g.jpg',
 'jT8Ym5fRwwoe5n770T81-w.jpg',
 '6wuJ-q6_315h_zHjkYtm7g.jpg',
 'WdkTbegg-N1qWgcPmK481A.jpg',
 'x-qREg6vXO_wRE59WO21Bw.jpg',
 'ix90Z9EC51oMmEqzTa4nlA.jpg',
 'wO0OVq8NffIUv_XmjAo-_A.jpg',
 'zpqIzl9KyDbH337go9db7A.jpg',
 '9gq8hvrC2uLFFDlELw8DmQ.jpg',
 'DfpYWXqyCZIDHKKpIdgZfg.jpg',
 '8nhhJl