In [5]:
import numpy as np
import os
import pandas as pd

# Read the pickled CUB open-set split description. With allow_pickle=True,
# np.load unpickles files that are not in .npy/.npz format, so this must only
# ever be pointed at a trusted file (unpickling can execute arbitrary code).
class_info = np.load(file="cub_osr_splits.pkl", allow_pickle=True)

In [14]:
# Known (closed-set) class ids, remapped onto contiguous labels 0..K-1 in
# sorted order of the original ids.
train_classes = class_info['known_classes']
train_classes_new_index = {
    cls_orig: cls_new
    for cls_new, cls_orig in enumerate(np.unique(train_classes))
}

# Unknown (open-set) class ids, grouped by difficulty; "all" is the
# concatenation Hard + Medium + Easy.
open_set_classes = class_info['unknown_classes']
open_set_classes_dict = dict(
    all=open_set_classes['Hard'] + open_set_classes['Medium'] + open_set_classes['Easy'],
    easy=open_set_classes['Easy'],
    medium=open_set_classes['Medium'],
    hard=open_set_classes['Hard'],
)
print(f"{len(train_classes)} train classes and {len(open_set_classes_dict['all'])} unseen")

100 train classes and 100 unseen


In [15]:
# Seed NumPy's global RNG so the np.random.choice draws made inside
# get_train_val_split (and hence the train/val partition) are repeatable.
np.random.seed(2)

def find_classes(classes_file):
    """Parse an ``<image_id> <class name>`` listing and index its classes.

    Each non-empty line is split at the first space: the part before it is the
    image id, the remainder (which may itself contain spaces) is the class
    name. Line terminators are stripped so that a final line without a
    trailing newline maps to the same class as every other line.

    Args:
        classes_file: Path of the text file to read.

    Returns:
        A tuple ``(image_ids, targets, classes, class_to_idx)`` where
        ``image_ids`` is a list of id strings, ``targets`` the list of integer
        class indices aligned with ``image_ids``, ``classes`` the sorted array
        of unique class names (from ``np.unique``) and ``class_to_idx`` the
        name -> index mapping.
    """
    image_ids = []
    targets = []
    # Context manager guarantees the handle is closed even if parsing raises.
    with open(classes_file, 'r') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                # Skip blank lines instead of recording an empty id/class.
                continue
            image_id, _, class_name = line.partition(' ')
            image_ids.append(image_id)
            targets.append(class_name)

    # Index class names in sorted order.
    classes = np.unique(targets)
    class_to_idx = {name: i for i, name in enumerate(classes)}
    targets = [class_to_idx[c] for c in targets]

    return (image_ids, targets, classes, class_to_idx)

def subsample_dataset(dataset, idxs):
    """Keep only the samples whose position is listed in ``idxs``.

    Args:
        dataset: A pair ``(imgs, targets)`` of equally long iterables.
        idxs: Iterable of integer positions to keep (Python or NumPy ints).
            Order and duplicates are irrelevant; the output always follows the
            order of ``dataset``.

    Returns:
        A tuple ``(imgs_sub, targets_sub)`` of two lists.
    """
    imgs, targets = dataset

    # Set membership is O(1); scanning a list for every sample made the
    # original quadratic in the dataset size.
    keep = set(idxs)

    imgs_sub = []
    targets_sub = []
    for i, (p, t) in enumerate(zip(imgs, targets)):
        if i in keep:
            imgs_sub.append(p)
            targets_sub.append(t)

    return (imgs_sub, targets_sub)

def get_train_val_split(image_ids, targets, val_split=0.2):
    """Split samples into train and validation subsets, class by class.

    For every distinct value in ``targets``, ``int(val_split * n_cls)``
    positions are drawn without replacement from NumPy's global RNG and sent
    to validation; the remaining positions of that class go to training.

    Args:
        image_ids: Iterable of image identifiers.
        targets: Iterable of class labels aligned with ``image_ids`` (list,
            NumPy array or pandas Series).
        val_split: Fraction of each class assigned to validation.

    Returns:
        ``(train_dataset, val_dataset)``, each an ``(image_ids, targets)``
        pair of lists as produced by ``subsample_dataset``.
    """
    dataset = (image_ids, targets)

    # Coerce to an array: comparing a plain list with a scalar yields a single
    # False, which would silently leave every class with no samples.
    targets_arr = np.asarray(targets)

    # Collect positional indices for each side of the split.
    train_idxs = []
    val_idxs = []
    for cls in np.unique(targets_arr):

        cls_idxs = np.where(targets_arr == cls)[0]

        n_val = int(val_split * len(cls_idxs))
        v_ = np.random.choice(cls_idxs, replace=False, size=(n_val,))
        # Set lookup instead of scanning the array for every candidate.
        held_out = set(v_.tolist())
        t_ = [x for x in cls_idxs if x not in held_out]

        train_idxs.extend(t_)
        val_idxs.extend(v_)

    # Materialise the two subsets from the selected positions.
    train_dataset = subsample_dataset(dataset, train_idxs)
    val_dataset = subsample_dataset(dataset, val_idxs)

    return train_dataset, val_dataset

# Only referenced by the commented-out loader below; `split` is reassigned by
# the export loop further down.
class_type='variant'
split='train'

# classes_file = os.path.join('data', 'images_%s_%s.txt' % (class_type, "trainval"))
# (image_ids, targets, classes, class_to_idx) = find_classes(classes_file)


# Load the three space-separated CUB_200_2011 metadata tables and join them on
# img_id into one frame: img_id, filepath, target, is_training_img.
images = pd.read_csv(os.path.join('CUB_200_2011', 'images.txt'), sep=' ',
                     names=['img_id', 'filepath'])
image_class_labels = pd.read_csv(os.path.join('CUB_200_2011', 'image_class_labels.txt'),
                                 sep=' ', names=['img_id', 'target'])
train_test_split = pd.read_csv(os.path.join('CUB_200_2011', 'train_test_split.txt'),
                               sep=' ', names=['img_id', 'is_training_img'])

data = images.merge(image_class_labels, on='img_id')
data = data.merge(train_test_split, on='img_id')
print(data.head())

data_training = data[data["is_training_img"] == 1]
data_test = data[data["is_training_img"] == 0]

# Carve a validation subset out of the training images, class by class.
datasets = dict()
datasets["train"], datasets["val"] = get_train_val_split(data_training["img_id"], data_training["target"])

# to_csv does not create missing directories, so make sure the target exists.
output_dir = "../../benchmark_imglist/fgvc-cub"
os.makedirs(output_dir, exist_ok=True)

for split in ["train","val","test","ood_easy","ood_medium","ood_hard"]:
    is_ood = "ood" in split

    # Choose the source rows: OOD lists and "test" use the test images,
    # "train"/"val" use the image ids picked by the split above.
    if is_ood:
        difficulty = split.split("_")[-1]
        df_split = data_test
    elif split == "test":
        df_split = data_test
    else:
        image_ids, targets = datasets[split]
        df_split = data_training[data_training["img_id"].isin(image_ids)]

    # `target` in the metadata is shifted by -1 before being compared with the
    # class ids from the split file.
    class_idx = df_split["target"] - 1

    if is_ood:
        # Keep only the open-set classes of this difficulty; label them -1.
        keep = class_idx.isin(open_set_classes_dict[difficulty])
        labels = [-1] * int(keep.sum())
    else:
        # Drop every open-set class and remap the rest to contiguous labels.
        keep = ~class_idx.isin(open_set_classes_dict["all"])
        labels = [train_classes_new_index[i] for i in class_idx[keep]]

    df = pd.DataFrame({
        'image_path': [f"CUB_200_2011/CUB_200_2011/images/{fp}"
                       for fp in df_split.loc[keep, "filepath"]],
        'label': labels,
    })

    suffix = difficulty if is_ood else "clean"

    print(df.head())
    print(f"{df['label'].nunique()} unique classes in {split}")
    assert df['image_path'].is_unique
    df.to_csv(f"{output_dir}/{split.split('_')[0]}_fgvc-cub_{suffix}.txt", sep=" ", header=False, index=False)

   img_id                                           filepath  target  \
0       1  001.Black_footed_Albatross/Black_Footed_Albatr...       1   
1       2  001.Black_footed_Albatross/Black_Footed_Albatr...       1   
2       3  001.Black_footed_Albatross/Black_Footed_Albatr...       1   
3       4  001.Black_footed_Albatross/Black_Footed_Albatr...       1   
4       5  001.Black_footed_Albatross/Black_Footed_Albatr...       1   

   is_training_img  
0                0  
1                1  
2                0  
3                1  
4                1  
                                          image_path  label
0  CUB_200_2011/CUB_200_2011/images/001.Black_foo...      0
1  CUB_200_2011/CUB_200_2011/images/001.Black_foo...      0
2  CUB_200_2011/CUB_200_2011/images/001.Black_foo...      0
3  CUB_200_2011/CUB_200_2011/images/001.Black_foo...      0
4  CUB_200_2011/CUB_200_2011/images/001.Black_foo...      0
100 unique classes in train
                                          image_path 