In [1]:
%load_ext autoreload
%autoreload explicit

from itertools import chain
from collections import Counter
from functools import cache
import re
from pathlib import Path


from prelib import data_dir
%aimport e6db.utils
from e6db.utils import TagSetNormalizer, tag_categories, tag_category2id

# Create the mappings

By default, the tag strings in this dataset use the raw e621 format, with underscores. `TagNormalizer` enables renaming the tags and adding new spellings as aliases.

The tag formatting choices made here are specific to [this dataset](https://huggingface.co/datasets/k4d3/furry), which is meant for PDXL training with sd-scripts. You should adapt them to your own dataset, model and trainer.

In [2]:
# Numeric ids of the two categories that get extra spellings in tag_mapfun
cat_artist, cat_lore = (e6db.utils.tag_category2id[name] for name in ('artist', 'lore'))

# Build the normalizer from data_dir and keep a shortcut to its
# tag id -> category id table (indexed by tag id in tag_mapfun)
tagset_normalizer = e6db.utils.TagSetNormalizer(data_dir)
tagid2cat = tagset_normalizer.tag_normalizer.tag_categories

@cache
def tag_mapfun(tag_underscores, tid):
    """
    Maps raw e621 tags to more natural forms.
    Will be run on:

    * The list of output tag strings,
    * Keys from the dictionary mapping strings to ids, contains canonical tag and aliases,
    * Implication source tags that are not used frequently enough to get an id.

    Args:
        tag_underscores: the raw tag string, with underscores.
        tid: the tag id, used to look up the category in `tagid2cat`,
            or None when the tag has no id (it is then treated as having no category).

    Returns a list, where the first string is the canonical tag used in the output,
    the others are additional aliases used for recognizing the tag.
    The list may contain duplicates (e.g. when the tag has no underscore).
    """
    cat = tagid2cat[tid] if tid is not None else -1
    tag = tag_underscores.replace('_', ' ')
    tags = [tag, tag_underscores]
    if cat == cat_artist:
        if not tag.startswith('by '):
            # 'by ' is used in the output tags.
            # The suffix is stripped in a separate statement: reusing the same quote
            # character inside an f-string is a syntax error before Python 3.12.
            artist = tag.removesuffix(' (artist)')
            tags.insert(0, f'by {artist}')
        if not tag.endswith('(artist)'):
            artist = tag.removeprefix('by ')
            tags.append(f'{artist} (artist)')
    elif cat == cat_lore and not tag.endswith(' (lore)'):
        tags.append(f'{tag} (lore)')

    # Also recognize every alias with backslash-escaped parentheses
    escaped_parens = [t.replace('(', r'\(').replace(')', r'\)') for t in tags]
    # tags[1:] is a copy, so appending to `tags` inside the loop is safe
    for t, ep in zip(tags[1:], escaped_parens[1:]):
        if t != ep:
            tags.append(ep)
    if escaped_parens[0] != tags[0]:
        tags.insert(0, escaped_parens[0]) # apparently sd-scripts require escaped parentheses
    if ':' in tag: # Recognize tags where ':' were replaced by a space
        tags.append(tag.replace(':', ' '))
    return tags


# Note: one function currently drives both the recognized spellings and the
# output strings, but this will change.
# on_conflict accepts: silent, overwrite, overwrite_rarest, warn, raise
# (use warn to debug conflicts).
tagset_normalizer = tagset_normalizer.map_tags(tag_mapfun, on_conflict="overwrite_rarest")
del tag_mapfun # drop the cache

tag_normalizer = tagset_normalizer.tag_normalizer

# Add some underscores back
for spaced, underscored in (
    ('rating explicit', 'rating_explicit'),
    ('rating questionable', 'rating_questionable'),
    ('rating safe', 'rating_safe'),
):
    tag_normalizer.rename_output(spaced, underscored)

# Custom mappings: (input spelling, target tag, extra keyword arguments)
for alias, target, options in (
    ('explicit', 'rating_explicit', {}),
    ('score_explicit', 'rating_explicit', {}),
    ('safe', 'rating_safe', {'on_conflict': 'overwrite'}),
    ('score_safe', 'rating_safe', {}),
    ('questionable', 'rating_questionable', {'on_conflict': 'overwrite'}),
    ('score_questionable', 'rating_questionable', {}),
):
    tag_normalizer.add_input_mappings(alias, target, **options)

In [3]:
# Display the signature and docstring of map_tags (IPython `?` help syntax)
tagset_normalizer.map_tags?

[0;31mSignature:[0m
[0mtagset_normalizer[0m[0;34m.[0m[0mmap_tags[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmapfun[0m[0;34m:[0m [0mCallable[0m[0;34m[[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mint[0m [0;34m|[0m [0;32mNone[0m[0;34m][0m[0;34m,[0m [0mstr[0m [0;34m|[0m [0mlist[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmap_input[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmap_output[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon_conflict[0m[0;34m=[0m[0;34m'raise'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'TagSetNormalizer'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Apply a function to all tag strings.

The provided function will be run on:

* The of list output tag strings,
* Keys from the dictionary mapping strings to ids, contains canonical
  tag and aliases,
* Implication source tags that are not used frequen

# Create the blacklist

In [4]:
# We will use tagset_normalizer.encode()/decode() to normalize tags. Demo:
# encode() returns the ids of the recognized tags (unknown tags are kept as strings)
# and, separately, the ids of the implied tags that were stripped.
tags, implied = tagset_normalizer.encode(['solo', 'male', 'canine', 'mammal', 'unknown tag'])
# decode() maps the ids back to their output strings
print(f'{tags=} decoded={tagset_normalizer.decode(tags)} {implied=}')

# Display the signature and docstring of encode (IPython `?` help syntax)
tagset_normalizer.encode?

tags=[6, 5, 17, 'unknown tag'] decoded=['solo', 'male', 'canine', 'unknown tag'] implied={0, 15}


[0;31mSignature:[0m [0mtagset_normalizer[0m[0;34m.[0m[0mencode[0m[0;34m([0m[0mtags[0m[0;34m:[0m [0mIterable[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mkeep_implied[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Encode a list of string as numerical ids and strip implied tags.

Unknown tags are returned as strings.

Returns :

* a list of tag ids and unknown tag strings,
* a list of implied tag ids.
[0;31mFile:[0m      ~/code/e6db/e6db/utils/__init__.py
[0;31mType:[0m      method

In [5]:
# Manual blacklist: a list of e621 tags or unknown tag strings
blacklist = r"""
invalid tag, by conditional dnp, 
hi res, absurd res, superabsurd res, 4k,
uncensored, ambiguous gender,
translation edit, story in description,
non- balls, non- nipples, non- breasts, feet out of frame
"""
# Split on commas, trim the whitespace and drop the empty entries
blacklist = {t for t in map(str.strip, blacklist.split(',')) if t}
# multiline is ok, but don't forget the comma on line endings
assert all('\n' not in t for t in blacklist)

# blacklist years, digits only tags, and aspect ratios
all_tags = tagset_normalizer.tag_normalizer.idx2tag
RE_BLACKLIST = re.compile(r'(\d+|\d+:\d+)')
blacklist |= {t for t in all_tags if RE_BLACKLIST.fullmatch(t)}
# blacklist tags ending with ' at source'
blacklist |= {t for t in all_tags if t.endswith(' at source')}

# Encode the blacklist to ids (unknown tags stay as strings)
blacklist_ids, implied = tagset_normalizer.encode(blacklist)
# Also blacklist tags implied by blacklisted tags
blacklist = set(blacklist_ids) | implied

print(', '.join(tagset_normalizer.decode(blacklist)))

hi res, 32016, 1621, 1897, 2002, 1860, 1940, 1798, 1889, 2011, 1970, 1896, 1845, 2013, story, 3, non- breasts, text, 1936, by third-party edit, absurd res, 9:16, 135:256, 18th century, 1890, 4:7, 1991, 2008, english text, 1914, 11:8, 1857, 2024, 1963, 4:5, 2004, 2:1, 1869, non- nipples, 1983, 1780, 1992, 1862, 256:135, 1962, 8888, 5:6, 1885, translated, 72016, 1846, 2112, hard translated, 1925, story at source, 1873, 1887, 2003, 612, story in description, 1894, 1879, 1880, 1876, smaller version at source, 1974, 1:1, 7:4, 1921, 1545, 1994, 1872, 1993, 20520, 1926, 1833, 1534, 1874, 1900, ambiguous gender, 12, 100000, 1957, 1850, 2005, 10000, 1858, 1:2, 2:3, 1566, 1965, context at source, ancient art, 1945, 682, 1955, 2022, 1956, 1837, 80085, 1975, 196, 1969, 1995, 2023, non- balls, watermarked at source, 1868, 1938, 9:14, 1916, 204, unavailable at source, 1903, 1893, 2021, 20th century, 1899, 1842, 2020, better version at source, 1979, 007, 19th century, 15th century, 2009, 1913, 1768, 

# Edit caption files

Loads, normalizes, and removes implied tags from the files in an sd-scripts hierarchy of directories.

Unknown tags are kept untouched.

Specific to kade's format, captions are detected as tags ending with a period and are moved to the end of the file.

In [6]:
RE_SEP = re.compile(r'[,\n]') # Split on commas and newline

# Root of the caption files to process
dataset_root = Path('~/repos/kade/furry').expanduser()
# Rewritten files go under output_dir, at the same path relative to dataset_root
output_dir = Path('/tmp/furry_fix')
#output_dir = dataset_root # clobber mode

def load_caption(fp):
    """
    Read a caption file and separate its tags from its free-text captions.

    The content is split with `RE_SEP` (commas and newlines); each chunk is
    stripped and empty chunks are dropped. A chunk ending with a period is
    treated as a caption, anything else as a tag.

    Note: the split happens before the period check, so a caption sentence
    containing a comma is cut into several chunks.

    Args:
        fp: path of the caption file, read as UTF-8.

    Returns:
        A `(tags, captions)` pair of lists, each in file order.
    """
    tags, captions = [], []
    # Explicit encoding: the default one depends on the platform locale
    with open(fp, 'rt', encoding='utf-8') as fd:
        content = fd.read()
    for chunk in RE_SEP.split(content):
        chunk = chunk.strip()
        if not chunk:
            continue
        (captions if chunk.endswith('.') else tags).append(chunk)
    return tags, captions


# Tag frequencies over the whole dataset (ids for known tags, strings for unknown ones)
counter = Counter()
for file in chain(dataset_root.glob('**/*.txt'), dataset_root.glob('**/*.cap*')):
    if 'sample-prompts' in file.name:
        continue
    orig_tags, captions = load_caption(file)
    # encode() normalizes the tags and strips the implied ones;
    # the implied ids themselves are not needed here
    tags, _implied = tagset_normalizer.encode(orig_tags)
    tags = [t for t in tags if t not in blacklist]

    # Count before the early exit below, so unchanged files are counted too
    counter.update(tags)
    tags = tagset_normalizer.decode(tags)
    if tags == orig_tags:
        continue # nothing to rewrite

    output_file = output_dir / file.relative_to(dataset_root)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Tags first, captions moved to the end; written with the same encoding as the read
    output_file.write_text(', '.join(chain(tags, captions)), encoding='utf-8')

In [7]:
# Shows tag frequencies in the dataset, not counting the occurrences of implied tags
for tag, count in counter.most_common(n=100):
    if not isinstance(tag, int):
        # Non-integer keys are tags that were left as unknown strings
        print(f'{tag:<30} {count=} (unknown)')
        continue
    # Integer keys are tag ids: show the output string and the category name
    tag_string = tagset_normalizer.tag_normalizer.decode(tag)
    cat_id = tagset_normalizer.tag_normalizer.tag_categories[tag]
    cat = tag_categories[cat_id]
    print(f'{tag_string:<30} {count=} (e621:{cat})')

solo                           count=10877 (e621:general)
rating_explicit                count=9363 (e621:general)
anthro                         count=7274 (e621:general)
nude                           count=5906 (e621:general)
female                         count=4525 (e621:general)
digital media \(artwork\)      count=4245 (e621:meta)
erection                       count=3952 (e621:general)
male                           count=3821 (e621:general)
looking at viewer              count=3600 (e621:general)
duo                            count=3555 (e621:general)
balls                          count=3492 (e621:general)
tail                           count=3485 (e621:general)
nipples                        count=3474 (e621:general)
open mouth                     count=3264 (e621:general)
smile                          count=3089 (e621:general)
blush                          count=2956 (e621:general)
rating_safe                    count=2764 (e621:general)
white fur                      co

In [8]:
# Top 100 of unknown tags:
# unknown tags are the string keys of the counter; most_common() sorts by decreasing count
unknown = [(k, v) for k, v in counter.most_common() if isinstance(k, str)]
print(', '.join(f'{k}({v})' for k, v in unknown[:100]))

furry sticker(306), furry with furry(290), animal focus(287), male pubic hair(251), furry with non-furry(243), gynomorph female(224), intersex intersex(140), male masturbation(125), gynomorph male(118), navel hair(107), by spaceengine(94), gynomorph gynomorph(91), male ambiguous(89), tiger boy(87), photo \(medium\)(78), score explicit(63), clothed nude(60), greg rutkowski(54), five nights at freddy's security breach(53), blp(52), avery palmer(50), hamgas(48), bare pectorals(47), digimon \(creature\)(47), cum on boy(39), two-tone skin(39), animal hands(34), black male underwear(29), by jwst(29), leopard ears(28), ground vehicle(27), legendary(27), cum on pectorals(25), white male underwear(25), female ambiguous(25), by hubble(24), andromorph male(22), painting \(medium\)(22), demon horns(19), herm male(19), lion boy(18), herm female(18), dharrel(18), colored nipples(17), krystal(17), shower \(place\)(15), flame-tipped tail(15), genderswap \(mtf\)(14), sidepec(13), animal collar(13), hea