In [1]:
# Path containing .json METADATA files

METADATA_PATH = '../metadata/'

In [2]:
# Preprocessing step to extract image IDs containing only a single individual of the desired gender

# Exact match on desired tag, regex match on bad tag
# Used to filter for pictures of single individuals (e.g. "good" and no tags containing "girl")

def exclusive_filter(good, bad):
    ids=[]
    matchcount = 0

    import json
    import glob

    # Parse through all metadata files in metadata path
    for filename in glob.glob('{}/*.json'.format(METADATA_PATH)):
        with open(filename, 'r', encoding="utf-8") as f:
            counter = 0
            for line in f:
                counter += 1

                if (counter%1000 == 0):
                    print('Parsing object {} in {}.  Found {} matches.'.format(counter, filename, matchcount))

                data = json.loads(line)
                id = data["id"]
                onlymatch = False

                for tag in data["tags"]:
                    if tag["name"] == good:
                        onlymatch = True
                    elif bad in tag["name"]:
                        onlymatch = False
                        break

                # We're only looking at metadata for sfw(ish) images
                sfw = (data["rating"] == 's')

                if not sfw or not onlymatch:
                    continue

                ids.append(int(id))
                matchcount += 1

    ids.sort()
    return ids

In [None]:
# Parse out IDs of all pictures containing exactly one male character and no others

ids = exclusive_filter("1boy", "girl")

with open("single_male.txt", "w") as outfile:
    for post in ids:
        outfile.write("ID: {}\n".format(post))

In [3]:
# Parse out IDs of all pictures containing exactly one female character and no others

ids = exclusive_filter("1girl", "boy")

with open("single_female.txt", "w") as outfile:
    for post in ids:
        outfile.write("ID: {}\n".format(post))

Parsing object 1000 in ../metadata\2017000000000000.json.  Found 388 matches.
Parsing object 2000 in ../metadata\2017000000000000.json.  Found 778 matches.
Parsing object 3000 in ../metadata\2017000000000000.json.  Found 1180 matches.
Parsing object 4000 in ../metadata\2017000000000000.json.  Found 1619 matches.
Parsing object 5000 in ../metadata\2017000000000000.json.  Found 1852 matches.
Parsing object 6000 in ../metadata\2017000000000000.json.  Found 2245 matches.
Parsing object 7000 in ../metadata\2017000000000000.json.  Found 2539 matches.
Parsing object 8000 in ../metadata\2017000000000000.json.  Found 2834 matches.
Parsing object 9000 in ../metadata\2017000000000000.json.  Found 3083 matches.
Parsing object 10000 in ../metadata\2017000000000000.json.  Found 3406 matches.
Parsing object 11000 in ../metadata\2017000000000000.json.  Found 3630 matches.
Parsing object 12000 in ../metadata\2017000000000000.json.  Found 3959 matches.
Parsing object 13000 in ../metadata\201700000000000

In [9]:
# Build a separate folder for all images with matching IDs

import shutil

def copy_images_by_ids(id_file, source_dir, dest_dir):
    with open(id_file, "r") as img_ids:
        ids = []
        content = img_ids.readlines()
        for line in content:
            ids.append(int(line[4:-1]))

        imgcount = 0
        print("Copying all matches to folder...")
        for id in ids:
            imgcount += 1

            try:
                # Dataset paths are based on modulo of last 4 digits to create equally sized directories
                shutil.copy2('{}/0'.format(source_dir) + str(id%1000) + '/' + str(id) + '.jpg', dest_dir)
            except FileNotFoundError:
                print ("Could not copy image with id: {}.  Continuing...".format(id))

            if (imgcount%1000 == 0):
                print("Copying img {}".format(imgcount))


In [None]:
# Copy female matching ID samples to separate path

copy_images_by_ids("single_female.txt", '../512px', './female')

Copying all matches to folder...
Could not copy image with id: 3.  Continuing...
Could not copy image with id: 7.  Continuing...
Could not copy image with id: 8.  Continuing...
Could not copy image with id: 10.  Continuing...
Could not copy image with id: 14.  Continuing...
Could not copy image with id: 17.  Continuing...
Could not copy image with id: 18.  Continuing...
Could not copy image with id: 19.  Continuing...
Could not copy image with id: 22.  Continuing...
Could not copy image with id: 23.  Continuing...
Could not copy image with id: 26.  Continuing...
Could not copy image with id: 28.  Continuing...
Could not copy image with id: 33.  Continuing...
Could not copy image with id: 38.  Continuing...
Could not copy image with id: 41.  Continuing...
Could not copy image with id: 42.  Continuing...
Could not copy image with id: 43.  Continuing...
Could not copy image with id: 44.  Continuing...
Could not copy image with id: 47.  Continuing...
Could not copy image with id: 48.  Cont

In [None]:
# Copy male matching ID samples to separate path

copy_images_by_ids("single_male.txt", '../512px', './male')