In [4]:
import spacy
from spacy.tokens import Doc, DocBin
import psycopg
from pathlib import Path
from tqdm import tqdm
import sys
from collections import defaultdict
import csv

In [26]:
# Load every row of the n-gram table.
# newline="" hands line-ending handling to the csv module, so quoted fields
# that contain newlines are read back exactly as they were written.
with open("n-grams.csv", "r", newline="") as f:
	reader = csv.reader(f)
	ngrams = list(reader)

# Filter: drop the header row, then keep rows whose third column (the count) exceeds 1000.
header = ngrams.pop(0)
ngrams = [ngram for ngram in ngrams if int(ngram[2]) > 1000]

# Save the surviving rows under a fresh header.
# NOTE(review): "occurances" is misspelled but is part of the output file's format;
# confirm no reader depends on it before renaming.
with open("n-grams-filtered.csv", "w", newline="") as f:
	writer = csv.writer(f)
	writer.writerow(("ngram", "text", "occurances"))
	writer.writerows(ngrams)

In [24]:
# Spot-check: display the row at index 1 of `ngrams`.
ngrams[1]

['1', 'and', '10532943']

In [27]:
# Read every usable image row from Postgres into `records` as
# (tag_string, subreddit, caption, source) tuples.
# The connection is used as a context manager so it is closed when the block
# ends; previously each re-run of this cell left another connection open.
with psycopg.connect(dbname='postgres', user='postgres', host=str(Path.cwd() / "pg-socket")) as conn:
	# A named cursor is server-side in psycopg 3, so rows are streamed rather
	# than fetched into client memory in one go.
	with conn.cursor('dataset-builder') as cur:
		cur.execute("SELECT tag_string, subreddit, caption, caption_2, caption_3, caption_4, source FROM images WHERE embedding IS NOT NULL AND score IS NOT NULL AND score > 0 AND tag_string IS NOT NULL and caption IS NOT NULL")
		records = []

		for tag_string, subreddit, caption, caption_2, caption_3, caption_4, source in tqdm(cur, desc="Reading records", dynamic_ncols=True):
			# The highest-numbered caption that is present wins; `caption` itself
			# is the fallback when none of the alternatives are set.
			for alternative in (caption_2, caption_3, caption_4):
				if alternative is not None:
					caption = alternative

			records.append((tag_string, subreddit, caption, source))

Reading records: 6716761it [01:17, 86208.74it/s] 


In [28]:
# Count how often each tag occurs across all records, then write the tags
# seen at least 1000 times to tag-counts.csv, most frequent first.
tag_counts = defaultdict(int)
VALID_SOURCES = {"fansly", "flickr", "onlyfans", "unsplash"}

for tag_string, subreddit, caption, source in tqdm(records, desc="Counting tags", dynamic_ncols=True):
	tags = tag_string.split(",")
	if subreddit is not None:
		# Reddit rows get a per-subreddit tag plus a generic "reddit" tag.
		tags.append(f"r/{subreddit.lower()}")
		tags.append("reddit")

	# Membership in the set is already False for None, so no separate None check is needed.
	if source in VALID_SOURCES:
		tags.append(source)

	for tag in tags:
		tag_counts[tag] += 1

# newline="" lets the csv module control line endings itself (avoids blank
# rows on Windows and keeps any embedded newlines intact).
with open("tag-counts.csv", "w", newline="") as f:
	writer = csv.writer(f)
	writer.writerow(("tag", "count"))
	for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True):
		# Rows are in descending order, so everything after the first rare tag is rare too.
		if count < 1000:
			break

		writer.writerow((tag, count))

Counting tags: 100%|██████████| 6716761/6716761 [00:43<00:00, 155171.28it/s]


In [3]:
def print_spacy_doc_info(doc: Doc):
	"""
	Dump a human-readable report of a spaCy Doc to stdout.

	Four sections are printed in order: a per-token attribute table, the
	named entities with their character offsets, the noun chunks with their
	root token details, and the sentences.

	Parameters:
	doc (spacy.tokens.Doc): The document to describe.
	"""
	def as_row(cells, widths):
		# Pad each cell to its minimum column width; columns are separated by one space.
		return " ".join(f"{cell:{width}}" for cell, width in zip(cells, widths))

	def rule(length):
		return "=" * length

	# --- Token table ---
	token_widths = (10, 10, 6, 6, 10, 8, 10, 10)
	token_titles = ("Text", "Lemma", "POS", "Tag", "Dep", "Shape", "Is Alpha", "Is Stop")
	print("Tokens and Attributes:")
	print(as_row(token_titles, token_widths))
	print(rule(80))
	for tok in doc:
		# Booleans are stringified first so they are padded like text columns.
		cells = (
			tok.text, tok.lemma_, tok.pos_, tok.tag_,
			tok.dep_, tok.shape_, str(tok.is_alpha), str(tok.is_stop),
		)
		print(as_row(cells, token_widths))

	# --- Named entities ---
	entity_widths = (20, 10, 6, 6)
	print("\nNamed Entities:")
	print(as_row(("Entity", "Label", "Start", "End"), entity_widths))
	print(rule(50))
	for entity in doc.ents:
		print(as_row((entity.text, entity.label_, entity.start_char, entity.end_char), entity_widths))

	# --- Noun chunks ---
	print("\nNoun Chunks:")
	print(rule(50))
	for phrase in doc.noun_chunks:
		head_token = phrase.root
		print(f"{phrase.text:{20}} - Root: {head_token.text} - Dep: {head_token.dep_} - Head: {head_token.head.text}")

	# --- Sentences ---
	print("\nSentences:")
	print(rule(50))
	for sentence in doc.sents:
		print(f"Sentence: {sentence.text}")

In [None]:
# Compare the small, medium and large English pipelines on one caption.
# Only the first record is inspected, so it is taken directly instead of
# starting a progress-bar loop over every record and breaking out of it.
MODEL_NAMES = ("en_core_web_sm", "en_core_web_md", "en_core_web_lg")

if records:
	# Records are (tag_string, subreddit, caption, source) tuples.
	caption = records[0][2]

	for position, model_name in enumerate(MODEL_NAMES):
		if position:
			# Blank lines between consecutive reports (none after the last).
			print("\n" * 2)

		# After the loop `nlp` is the last model loaded, as it was before.
		nlp = spacy.load(model_name)
		doc = nlp(caption)
		print_spacy_doc_info(doc)

In [5]:
# Records are (tag_string, subreddit, caption, source) tuples; keep only the caption (index 2).
captions = [record[2] for record in tqdm(records)]

100%|██████████| 6716761/6716761 [00:01<00:00, 4491377.36it/s]


In [22]:
# Run the small English pipeline over every caption and keep all resulting Docs in memory.
nlp = spacy.load("en_core_web_sm")
caption_stream = nlp.pipe(captions, batch_size=256, n_process=16)
# The stream has no length, so tqdm is told the total explicitly.
docs = list(tqdm(caption_stream, desc="Processing captions", total=len(captions)))

Processing captions: 100%|██████████| 6716761/6716761 [57:19<00:00, 1953.01it/s]  


In [26]:
# Serialize the parsed captions to disk in shards of SHARD_SIZE docs each.
SHARD_SIZE = 1000
CAPTION_DIR = Path.cwd() / "spacy-captions"

# The output directory is the same for every shard, so create it once up front.
CAPTION_DIR.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(0, len(docs), SHARD_SIZE)):
	doc_bin = DocBin(store_user_data=True)
	for doc in docs[i:i+SHARD_SIZE]:
		doc_bin.add(doc)
	# Each file is named after the index of the first doc it contains.
	dst = CAPTION_DIR / f"captions-{i}.spacy"
	doc_bin.to_disk(dst)

100%|██████████| 6717/6717 [55:23<00:00,  2.02it/s]


In [14]:
def merge_noun_phrases(doc: Doc) -> list[str]:
	"""
	Flatten a Doc into a list of strings in which each noun chunk is one entry.

	Tokens outside noun chunks contribute their own text, except punctuation
	tokens, which are dropped. Each noun chunk contributes its full text
	verbatim as a single entry.

	Parameters:
	doc (spacy.tokens.Doc): The parsed document to flatten.

	Returns:
	list[str]: Token texts and noun-chunk texts in document order.
	"""
	def plain_words(span):
		# Texts of the non-punctuation tokens in a stretch of the document.
		return [tok.text for tok in span if not tok.is_punct]

	pieces = []
	cursor = 0  # index of the first token not yet emitted

	for phrase in doc.noun_chunks:
		# Emit the loose tokens before this chunk, then the chunk as a whole.
		pieces += plain_words(doc[cursor:phrase.start])
		pieces.append(phrase.text)
		cursor = phrase.end

	# Whatever follows the final chunk (or the whole doc if there were none).
	pieces += plain_words(doc[cursor:])
	return pieces

In [30]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
import gc
gc.collect()

709

In [None]:
from multiprocessing import Pool


# Lazily populated per-process cache for the spaCy pipeline (see _get_worker_nlp).
_WORKER_NLP = None


def _get_worker_nlp():
	"""Return this process's spaCy pipeline, loading it on first use only."""
	global _WORKER_NLP
	if _WORKER_NLP is None:
		_WORKER_NLP = spacy.load("en_core_web_sm")
	return _WORKER_NLP


def parse_docs(doc_path: Path) -> dict[int, dict[str, int]]:
	"""
	Count the 1- to 7-grams in one serialized DocBin file.

	Each doc is flattened with merge_noun_phrases (so a noun chunk counts as
	a single token), lowercased, and every window of n consecutive entries is
	joined with single spaces, stripped of surrounding whitespace, and counted.

	Parameters:
	doc_path (Path): Path to a .spacy file written by DocBin.to_disk.

	Returns:
	dict[int, dict[str, int]]: For each n in 1..7, a mapping from n-gram text
	to its number of occurrences in this file.
	"""
	n_grams = {n: defaultdict(int) for n in range(1, 8)}

	# The pipeline is needed only for its vocab; loading it once per worker
	# process instead of once per file avoids thousands of redundant model loads.
	vocab = _get_worker_nlp().vocab

	doc_bin = DocBin().from_disk(doc_path)

	for doc in doc_bin.get_docs(vocab):
		# Lowercase each entry once rather than once per n-gram window.
		lowered = [token.lower() for token in merge_noun_phrases(doc)]

		for n, counts in n_grams.items():
			for i in range(len(lowered) - n + 1):
				n_gram = " ".join(lowered[i:i + n]).strip()
				counts[n_gram] += 1

	return n_grams


# Grand totals: one counter per n-gram size from 1 to 7.
all_n_grams = {n: defaultdict(int) for n in range(1, 8)}

saved_docs = list((Path.cwd() / "spacy-captions").glob("*.spacy"))

# Fan the shard files out to 16 worker processes and fold each partial
# result into the totals as it arrives (arrival order does not matter).
with Pool(16) as pool:
	partial_results = pool.imap_unordered(parse_docs, saved_docs)
	for partial in tqdm(partial_results, total=len(saved_docs), desc="Reading captions", dynamic_ncols=True):
		for size, partial_counts in partial.items():
			totals = all_n_grams[size]
			for phrase, occurrences in partial_counts.items():
				totals[phrase] += occurrences


# Show the ten most frequent n-grams of each size.
for size, totals in all_n_grams.items():
	print(f"{size}-grams:")
	ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
	for phrase, occurrences in ranked[:10]:
		print(f"{phrase}: {occurrences}")

	print("\n" * 2)

In [19]:
# Save the n-grams to a csv file: the 10000 most frequent n-grams of each size.
rows = []
for n, counts in tqdm(all_n_grams.items()):
	top = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:10000]
	rows.extend((n, n_gram, count) for n_gram, count in top)

# newline="" lets the csv module control line endings, so n-grams that contain
# newline characters stay inside their quoted field and no blank rows appear on Windows.
with open("n-grams.csv", "w", newline="") as f:
	writer = csv.writer(f)
	writer.writerow(["n", "n-gram", "count"])
	writer.writerows(rows)

100%|██████████| 7/7 [02:05<00:00, 18.00s/it]


In [None]:
# In-process variant: count 1- to 7-grams over the in-memory docs, treating
# each noun chunk as a single token.
n_grams = {n: defaultdict(int) for n in range(1, 8)}
gc.collect()

for doc in tqdm(docs, desc="Processing captions", dynamic_ncols=True):
	lowered = [piece.lower() for piece in merge_noun_phrases(doc)]

	for size, bucket in n_grams.items():
		# Slide a window of `size` entries across the doc.
		for begin in range(len(lowered) - size + 1):
			bucket[" ".join(lowered[begin:begin + size])] += 1

# Show the ten most frequent n-grams of each size.
for size, bucket in n_grams.items():
	print(f"{size}-grams:")
	ranked = sorted(bucket.items(), key=lambda item: item[1], reverse=True)
	for phrase, occurrences in ranked[:10]:
		print(f"{phrase}: {occurrences}")

	print("\n" * 2)

In [33]:
# Save the n-grams to a file: the 10000 most frequent n-grams of each size,
# one "n,n-gram,count" row per line.
# Rows go through csv.writer so that n-grams containing commas, quotes or
# newlines are quoted instead of corrupting the column layout; rows without
# such characters are written exactly as before. newline="" plus an explicit
# "\n" terminator keeps line endings under the csv module's control.
with open("n-grams.txt", "w", newline="") as f:
	writer = csv.writer(f, lineterminator="\n")
	for n, counts in tqdm(n_grams.items()):
		top = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:10000]
		writer.writerows((n, n_gram, count) for n_gram, count in top)

100%|██████████| 7/7 [02:11<00:00, 18.86s/it]


In [None]:
# Baseline variant: count 1- to 7-grams over individual tokens (no noun-chunk
# merging), ignoring punctuation tokens.
n_grams = {n: defaultdict(int) for n in range(1, 8)}

for doc in tqdm(docs):
	words = [token.text.lower() for token in doc if not token.is_punct]

	for size, bucket in n_grams.items():
		# Slide a window of `size` words across the doc.
		for begin in range(len(words) - size + 1):
			bucket[" ".join(words[begin:begin + size])] += 1

# Show the ten most frequent n-grams of each size.
for size, bucket in n_grams.items():
	print(f"{size}-grams:")
	ranked = sorted(bucket.items(), key=lambda item: item[1], reverse=True)
	for phrase, occurrences in ranked[:10]:
		print(f"{phrase}: {occurrences}")

	print("\n" * 2)