In [26]:
import os, shutil
import cv2
import random
from collections import defaultdict

In [2]:
# Relative path to the dataset's data folder; the cells below build every
# annotation and image path from this prefix.
dataset_path = "fgvc-aircraft-2013b/data"

Crop images

In [3]:
# Load the bounding-box annotations, one whitespace-stripped string per line.
with open(f"{dataset_path}/images_box.txt", "r") as box_file:
    images_box = list(map(str.strip, box_file))

In [6]:
# Crop every image listed in `images_box` to its bounding box and save the
# result, under the same file name, in <dataset_path>/images_cropped.
cropped_img_dir = f"{dataset_path}/images_cropped"
# exist_ok=True keeps the cell re-runnable; os.mkdir raises FileExistsError
# as soon as the directory already exists.
os.makedirs(cropped_img_dir, exist_ok=True)

for img in images_box:
    # Each entry has five whitespace-separated fields: image id + box corners.
    img_name, xmin, ymin, xmax, ymax = img.split()

    img_name += ".jpg"
    xmin, ymin, xmax, ymax = int(xmin), int(ymin), int(xmax), int(ymax)

    image = cv2.imread(f"{dataset_path}/images/{img_name}")
    # cv2.imread returns None instead of raising when the file cannot be read.
    if image is None:
        raise FileNotFoundError(f"image {img_name} not found")

    # Array indexing is [rows, cols], i.e. [y, x].
    # NOTE(review): the slice is 0-based and end-exclusive; confirm this matches
    # the coordinate convention used by images_box.txt.
    image_crop = image[ymin:ymax, xmin:xmax]

    # cv2.imwrite reports a failed write through its boolean return value,
    # so check it instead of silently skipping the image.
    out_path = f"{cropped_img_dir}/{img_name}"
    if not cv2.imwrite(out_path, image_crop):
        raise OSError(f"could not write cropped image {out_path}")


Number of classes and samples in each class

In [8]:
# Collect the distinct family names (one per line) and display how many there are.
with open(f"{dataset_path}/families.txt", "r") as file:
    families = {line.strip() for line in file}

len(families)

70

In [14]:
# Per-family image counter, started fresh here and seeded with the training list.
SAMPLE_COUNT = defaultdict(int)

with open(f"{dataset_path}/images_family_train.txt", "r") as file:
    train_samples = list(map(str.strip, file))

# Each entry is "<image id> <family name>"; maxsplit=1 keeps family names
# that contain spaces in one piece.
for _, fam_name in (entry.split(maxsplit=1) for entry in train_samples):
    SAMPLE_COUNT[fam_name] += 1


In [18]:
# Add the validation and test lists to the running per-family totals.
# Caution: this cell increments the existing SAMPLE_COUNT, so running it a
# second time without re-creating the counter adds these samples again.
with open(f"{dataset_path}/images_family_val.txt", "r") as file:
    val_samples = list(map(str.strip, file))

for _, fam_name in (entry.split(maxsplit=1) for entry in val_samples):
    SAMPLE_COUNT[fam_name] += 1


with open(f"{dataset_path}/images_family_test.txt", "r") as file:
    test_samples = list(map(str.strip, file))

for _, fam_name in (entry.split(maxsplit=1) for entry in test_samples):
    SAMPLE_COUNT[fam_name] += 1


In [23]:
# Display the per-family sample totals accumulated by the counting cells above.
SAMPLE_COUNT

defaultdict(int,
            {'Boeing 707': 100,
             'Boeing 727': 100,
             'Boeing 737': 800,
             'Boeing 747': 400,
             'Boeing 757': 200,
             'Boeing 767': 300,
             'Boeing 777': 200,
             'A300': 100,
             'A310': 100,
             'A320': 400,
             'A330': 200,
             'A340': 400,
             'A380': 100,
             'ATR-42': 100,
             'ATR-72': 100,
             'An-12': 100,
             'BAE 146': 200,
             'BAE-125': 100,
             'Beechcraft 1900': 100,
             'Boeing 717': 100,
             'C-130': 100,
             'C-47': 100,
             'CRJ-200': 100,
             'CRJ-700': 200,
             'Cessna 172': 100,
             'Cessna 208': 100,
             'Cessna Citation': 200,
             'Challenger 600': 100,
             'DC-10': 100,
             'DC-3': 100,
             'DC-6': 100,
             'DC-8': 100,
             'DC-9': 100,
             '

Create file lists for an 80/10/10 train/validation/test split, applied separately within each class

In [24]:
# Group every image file name by aircraft family, pooling the three
# annotation lists (train, val, test) into one mapping.
filelists = ["images_family_train.txt", "images_family_val.txt", "images_family_test.txt"]

AIRCRAFT = defaultdict(list)

for flist in filelists:
    with open(f"{dataset_path}/{flist}", "r") as file:
        samples = list(map(str.strip, file))

    for entry in samples:
        # maxsplit=1 keeps family names that contain spaces in one piece.
        image_id, fam_name = entry.split(maxsplit=1)
        AIRCRAFT[fam_name].append(image_id + ".jpg")


In [30]:
# Split each family's images 80/10/10 into train/val/test lists of
# (image file name, family name) pairs, then write one text file per split.

# Fixed seed so the split is identical after a kernel restart; a dedicated
# Random instance leaves the global `random` state untouched.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

train_filelist, val_filelist, test_filelist = list(), list(), list()

for fam_name, img_list in AIRCRAFT.items():
    # Shuffle a copy: AIRCRAFT keeps its original order, so re-running this
    # cell reproduces exactly the same split.
    shuffled = list(img_list)
    split_rng.shuffle(shuffled)

    n = len(shuffled)
    n_train = int(0.8 * n)
    n_val = int(0.1 * n)
    val_end = n_train + n_val  # everything past this index goes to test

    train_filelist.extend((img, fam_name) for img in shuffled[:n_train])
    val_filelist.extend((img, fam_name) for img in shuffled[n_train:val_end])
    test_filelist.extend((img, fam_name) for img in shuffled[val_end:])

print(len(train_filelist))
print(len(val_filelist))
print(len(test_filelist))

def save_filelist(filename, split):
    """Write a split to `filename`, one "<image> <family>" line per sample.

    Args:
        filename: path of the text file to create or overwrite.
        split: iterable of (image file name, family name) pairs.
    """
    with open(filename, "w") as f:
        for img, fam in split:
            f.write(f"{img} {fam}\n")

save_filelist(f"{dataset_path}/train_filelist.txt", train_filelist)
save_filelist(f"{dataset_path}/val_filelist.txt", val_filelist)
save_filelist(f"{dataset_path}/test_filelist.txt", test_filelist)


8000
1000
1000


Training, validation, and test sets

In [31]:
# Materialise the splits on disk as <dataset_path>/<split>/<family>/<image>.
splits = {
    "train": train_filelist,
    "val": val_filelist,
    "test": test_filelist
}

# Read from the directory the cropping cell writes to, rather than from a
# manually duplicated "images_cropped - Copy" folder that no cell creates.
cropped_src_dir = os.path.join(dataset_path, "images_cropped")

for split_name, filelist in splits.items():
    split_dir = os.path.join(dataset_path, split_name)

    for img_name, fam_name in filelist:
        class_dir = os.path.join(split_dir, fam_name)
        os.makedirs(class_dir, exist_ok=True)

        src_path = os.path.join(cropped_src_dir, img_name)
        dst_path = os.path.join(class_dir, img_name)

        # Copy instead of move: the cropped originals stay in place, so this
        # cell can be re-run without first re-creating its input.
        shutil.copy2(src_path, dst_path)
