In [1]:
import psycopg2
import json
from tqdm import tqdm
from pathlib import Path

In [2]:
def get_db_connection():
	return psycopg2.connect(host=(Path.cwd() / ".." / "pg-socket").absolute(), database="postgres", user="postgres")


def read_tag_aliases():
	"""Returns a dictionary of tag aliases. Given a tag like "ff7" as key, for example, the value would be "final_fantasy_vii"."""
	aliases = {}

	with open('../metadata/tag_aliases000000000000.json', 'r') as f:
		aliases = {}

		for line in f:
			alias = json.loads(line)
			aliases[alias['antecedent_name']] = alias['consequent_name']
	
	return aliases


def read_tag_implications():
	"""Returns a dictionary of tag implications. Given a tag like "mouse_ears" as key, for example, the value would be "animal_ears"."""
	implications = {}

	with open('../metadata/tag_implications000000000000.json', 'r') as f:
		implications = {}

		for line in f:
			implication = json.loads(line)
			implications[implication['antecedent_name']] = implication['consequent_name']
	
	return implications


def read_tag_blacklist():
	"""Returns a set of blacklisted tags."""
	with open('tag_blacklist.txt', 'r') as f:
		return set(l.strip() for l in f.read().splitlines() if l.strip() != '')

In [3]:
# Count tag usage of all valid posts
# Tag aliases are applied to canonicalize tags
# Tag implications are applied to expand tags and make sure general tags are counted correctly.  For example, "mouse_ears" is an implication of "animal_ears", so if a post has "mouse_ears" it should be counted as having "animal_ears" as well.
db = get_db_connection()

print("Reading tag aliases and implications...")
tag_aliases = read_tag_aliases()
tag_implications = read_tag_implications()

# Canonicalize tag implications
for tag, implied_tag in list(tag_implications.items()):
	# Canonicalize the implied tag
	if implied_tag in tag_aliases:
		tag_implications[tag] = tag_aliases[implied_tag]
	
	# Canonicalize the tag
	if tag in tag_aliases:
		tag_implications[tag_aliases[tag]] = tag_implications[tag]

print("Counting posts...")
with db.cursor() as cur:
	# Only count posts with embeddings (this excludes gif posts, for example)
	cur.execute("SELECT COUNT(*) FROM metadata INNER JOIN embeddings ON metadata.file_hash = embeddings.hash")
	total_posts, = cur.fetchone()

print("Reading posts...")
tag_counts = {}
min_tags = None
max_tags = None
mean_tags = 0
mean_tags_count = 0

with db.cursor("tag_query") as cur:
	cur.execute("SELECT tag_string, score FROM metadata INNER JOIN embeddings ON metadata.file_hash = embeddings.hash")

	for tag_string, score in tqdm(cur, total=total_posts):
		post_tags = set(t.strip() for t in tag_string.split(' '))

		# Apply tag aliases
		post_tags = set(tag_aliases.get(tag, tag) for tag in post_tags)

		# Apply tag implications
		post_tags.update(set(tag_implications.get(tag, tag) for tag in post_tags))

		for tag in post_tags:
			tag_counts[tag] = tag_counts.get(tag, 0) + 1
		
		if min_tags is None or len(post_tags) < min_tags:
			min_tags = len(post_tags)
		
		if max_tags is None or len(post_tags) > max_tags:
			max_tags = len(post_tags)
		
		mean_tags += len(post_tags)
		mean_tags_count += 1

mean_tags /= mean_tags_count

print(f"Min tags: {min_tags}")
print(f"Max tags: {max_tags}")
print(f"Mean tags: {mean_tags}")

Reading tag aliases and implications...
Counting posts...
Reading posts...


100%|██████████| 4620889/4620889 [09:02<00:00, 8518.67it/s] 

Min tags: 1
Max tags: 1160
Mean tags: 32.634047907231704





In [11]:
# Remove blacklisted tags
tag_blacklist = read_tag_blacklist()

for tag in tag_blacklist:
	if tag in tag_counts:
		del tag_counts[tag]

In [12]:
# Sort tags by count
tags = sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)

In [13]:
print(f"Found {len(tags)} tags")
print(f"Found {len([tag for tag, count in tags if count >= 10000])} tags with at least 10,000 usage")
print(f"Found {len([tag for tag, count in tags if count >= 1000])} tags with at least 1,000 usage")

Found 482000 tags
Found 1552 tags with at least 10,000 usage
Found 6813 tags with at least 1,000 usage


In [14]:
# Top 6000 tags
top_tags = tags[:6000]

In [15]:
# Write top tags to file
with open('top_tags.txt', 'w') as f:
	for tag, count in top_tags:
		f.write(tag + '\n')

In [16]:
print('safe' in top_tags)
print('questionable' in top_tags)
print('nsfw' in top_tags)
print('worst_quality' in top_tags)
print('low_quality' in top_tags)
print('medium_quality' in top_tags)
print('high_quality' in top_tags)
print('best_quality' in top_tags)
print('masterpiece' in top_tags)

False
False
False
False
False
False
False
False
False
