In [None]:
# Directory holding the .json metadata files.
# exclusive_filter() globs for '*.json' inside this directory and parses every
# line of each matching file as one JSON object.

METADATA_PATH = '../metadata/'

In [None]:
# Preprocessing step to extract image IDs containing only a single individual of the desired gender

# Exact match on the desired ("good") tag, substring match on the unwanted ("bad") tag
# Used to filter for pictures of single individuals (e.g. tagged "1boy" and no tag containing "girl")

def exclusive_filter(good, bad, metadata_path=None):
    """Collect IDs of safe-rated posts tagged `good` with no tag containing `bad`.

    Every ``*.json`` file in the metadata directory is read line by line; each
    non-blank line is parsed as one JSON object carrying ``"id"``,
    ``"rating"`` and ``"tags"`` (a list of objects with a ``"name"``) entries.
    Only posts whose rating is ``'s'`` are considered.

    Args:
        good: Tag name that must be present (exact string comparison).
        bad: Substring that must not occur in any other tag name of the post
            (plain ``in`` containment, not a regular expression).  A tag that
            equals `good` is never treated as bad, even if it contains `bad`.
        metadata_path: Directory to scan.  Defaults to the module-level
            ``METADATA_PATH`` when omitted.

    Returns:
        Sorted list of the matching post IDs, converted to ``int``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record is missing a key the filter needs to read.
    """
    import glob
    import json
    import os

    if metadata_path is None:
        metadata_path = METADATA_PATH

    ids = []

    # Parse through all metadata files in the metadata directory
    for filename in glob.glob(os.path.join(metadata_path, '*.json')):
        with open(filename, 'r', encoding="utf-8") as f:
            for counter, line in enumerate(f, start=1):
                if counter % 10000 == 0:
                    print('Parsing object {} in {}.  Found {} matches.'.format(counter, filename, len(ids)))

                # Tolerate blank lines (e.g. extra trailing newlines) instead
                # of letting json.loads() fail on them.
                if not line.strip():
                    continue

                data = json.loads(line)

                # We're only looking at metadata for sfw(ish) images; checking
                # this first avoids scanning the tags of posts we discard anyway.
                if data["rating"] != 's':
                    continue

                has_good = False
                has_bad = False
                for tag in data["tags"]:
                    name = tag["name"]
                    if name == good:
                        has_good = True
                    elif bad in name:
                        # One unwanted tag disqualifies the post outright.
                        has_bad = True
                        break

                if has_good and not has_bad:
                    ids.append(int(data["id"]))

    ids.sort()
    return ids

In [None]:
# Collect the IDs of posts tagged "1boy" that carry no tag containing "girl",
# then save them to single_male.txt, one "ID: <id>" entry per line.

ids = exclusive_filter("1boy", "girl")

with open("single_male.txt", "w") as outfile:
    outfile.writelines("ID: {}\n".format(post) for post in ids)

In [None]:
# Collect the IDs of posts tagged "1girl" that carry no tag containing "boy",
# then save them to single_female.txt, one "ID: <id>" entry per line.

ids = exclusive_filter("1girl", "boy")

with open("single_female.txt", "w") as outfile:
    outfile.writelines("ID: {}\n".format(post) for post in ids)

In [None]:
# Build a separate folder for all images with matching IDs

import shutil

def copy_images_by_ids(id_file, source_dir, dest_dir):
    """Copy the ``.jpg`` of every ID listed in `id_file` into `dest_dir`.

    Args:
        id_file: Text file with one ``ID: <number>`` entry per line; blank
            lines are ignored.
        source_dir: Root of the image tree.  Each image is looked up at
            ``<source_dir>/<bucket>/<id>.jpg`` where ``<bucket>`` is
            ``id % 1000`` zero-padded to four digits (``0000`` .. ``0999``).
        dest_dir: Directory receiving the copies; created if it is missing.

    Returns:
        None.  Images that cannot be found are reported on stdout and skipped.

    Raises:
        ValueError: If a non-blank line does not hold an integer after the
            four-character ``ID: `` prefix.
    """
    import os

    with open(id_file, "r") as img_ids:
        # Strip rather than slice off the last character so a final line
        # without a trailing newline keeps its last digit.
        ids = [int(line[4:].strip()) for line in img_ids if line.strip()]

    # Without an existing directory shutil.copy2() would treat dest_dir as a
    # file name and every image would overwrite the previous one.
    os.makedirs(dest_dir, exist_ok=True)

    print("Copying all matches to folder...")
    for imgcount, img_id in enumerate(ids, start=1):
        # Dataset paths are bucketed by the last 3 digits of the ID (id % 1000),
        # zero-padded to 4 characters, to create equally sized directories.
        bucket = '{:04d}'.format(img_id % 1000)
        source = os.path.join(source_dir, bucket, '{}.jpg'.format(img_id))

        try:
            shutil.copy2(source, dest_dir)
        except FileNotFoundError:
            print ("Could not copy image with id: {}.  Continuing...".format(img_id))

        if imgcount % 1000 == 0:
            print("Copying img {}".format(imgcount))


In [None]:
# Copy every image whose ID is listed in single_female.txt from the 512px tree into ./female

copy_images_by_ids(id_file="single_female.txt", source_dir='../512px', dest_dir='./female')

In [None]:
# Copy every image whose ID is listed in single_male.txt from the 512px tree into ./male

copy_images_by_ids(id_file="single_male.txt", source_dir='../512px', dest_dir='./male')