# Memes Library Builder

This library contains tens of thousands of memes organized in hundreds of folders by topic. This notebook builds the master json file which contains a list of all the topics, all the memes in each topic, and a list of any metadata associated with each meme.


In [1]:
## Import Libraries

import os
import json
from glob import glob
from pathlib import Path


In [2]:
## Define Constants

MEMES_ROOT = Path('memes')


# Create missing first-seen files

Iterate recursively through all the subdirectories of the ./memes folder and, for any file with one of the following extensions, check whether it already has a file with the same name in the same directory but appended with .first-seen.txt. So, for example, meme.jpg means there should also be a file called meme.jpg.first-seen.txt  

If not, create a new file containing the file modification time as a python datetime. This file should have the same name but appended with first-seen.txt. So for example meme.jpg means we need to create a file called meme.jpg.first-seen.txt containing the python datetime of the modification time of the meme.jpg file.  

- .gif
- .jfif
- .jpeg
- .jpg
- .mp4
- .png
- .svg
- .webp




In [3]:
from pathlib import Path
from datetime import datetime, timezone


# File extensions (lower-case, with the leading dot) that count as memes.
# Callers compare against `path.suffix.lower()`, so matching is case-insensitive.
TRACKED_EXTS = {
    ".gif", ".jfif", ".jpeg", ".jpg",
    ".mp4", ".png", ".svg", ".webp",
}

def ensure_first_seen_files(root: Path, exts: set[str]) -> dict[str, int]:
    """Create a ``<name>.first-seen.txt`` sidecar for each tracked file lacking one.

    Walks *root* recursively. A path is tracked when its lower-cased suffix is
    in *exts* and it is a regular file. For a tracked file without a sidecar,
    the file's modification time is written to the sidecar as a timezone-aware
    UTC ISO-8601 string. Existing sidecars are never touched.

    Returns:
        ``{"created": n, "skipped": m}`` where *n* is the number of sidecars
        written and *m* the number of tracked files that already had one.
    """
    counts = {"created": 0, "skipped": 0}

    for candidate in root.rglob("*"):
        is_tracked = candidate.suffix.lower() in exts and candidate.is_file()
        if not is_tracked:
            continue

        sidecar = candidate.with_name(f"{candidate.name}.first-seen.txt")
        if sidecar.exists():
            counts["skipped"] += 1
            continue

        # mtime as an aware UTC datetime, stored in ISO-8601 form
        first_seen = datetime.fromtimestamp(candidate.stat().st_mtime, tz=timezone.utc)
        sidecar.write_text(first_seen.isoformat())
        counts["created"] += 1

    return counts

# Back-fill missing first-seen sidecars for the whole library and report the counts.
summary = ensure_first_seen_files(MEMES_ROOT, TRACKED_EXTS)
print(f"First-seen files created: {summary['created']}")
print(f"Already present / skipped : {summary['skipped']}")

First-seen files created: 0
Already present / skipped : 8408


# Build the master memes.json file

The memes are organized like so;
/memes/Topic 1
/memes/Topic 2

Memes can be images or videos. Assume all the common file extensions will be present. For each meme, a number of metadata files may be present. These should be included if present.

For example;
memefilename.jpg
memefilename.jpg.txt <- Canonical tesseract-OCR of the meme. Might be nonsense. We probably won't need this, but it's there, so let's include it in the json file.
memefilename.jpg.llama-3.2-vision.txt <- High quality transformer analysis of the image, containing detailed explanation of the visual elements of the image, including any text, but probably lacking an awareness of any social context or any relationship to current events.

Example output:

'Topic 1' => {
    1 => {
        'file' => 'memes/topic/filename.jpg',
        'filemtime' => 'date the file was last modified',
        'metadata' => {
            'tesseract-ocr' => 'memes/topic/filename.jpg.txt',
            'llama-3.2-vision' => 'memes/topic/filename.jpg.llama-3.2-vision.txt'
        }
    }
}

The list of memes in each topic must be ordered by filemtime descending, such that the most recently added item is number 1. 

In [4]:
! pip install html5lib

# Helper functions
import html

from html5lib.serializer import escape as _h5_escape

# ──────────────────────────────────────────────────────────────────────
#  HTML-5 escaping  +  extra Liquid-safety for curly braces
# ──────────────────────────────────────────────────────────────────────
def escape_entities(text: str) -> str:
    """Escape *text* for embedding in HTML that is later run through Liquid.

    • ``&``, ``<`` and ``>`` become named entities.
    • ``"`` and ``'`` are encoded so the result is safe inside attribute values.
    • ``{`` and ``}`` are encoded so accidental ``{{`` / ``{%`` from OCR
      can't be parsed by Liquid.
    • Every line break (CRLF, lone CR or LF) becomes a single space.
    """
    # html5lib.serializer.escape is only an incidental re-export of the
    # standard library's xml.sax.saxutils.escape. Calling the stdlib function
    # directly gives identical output without depending on a third-party
    # module's private import.
    from xml.sax.saxutils import escape as _sax_escape

    escaped = _sax_escape(
        text,
        entities={
            '"': "&quot;",      # keep attribute-safe quotes
            "'": "&#x27;",      # HTML5-preferred apostrophe
            "{": "&#123;",      # <-- Liquid-safety
            "}": "&#125;",      # <-- Liquid-safety
        },
    )
    # Replace CRLF first so a Windows line ending yields one space, not two.
    return escaped.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")


def read_text_multi(path: Path, encodings=('utf-8', 'utf-8-sig', 'cp1252', 'latin-1')):
    """Read *path* as text, trying each of *encodings* in order.

    The first encoding that decodes without error wins. A leading byte-order
    mark is removed from the result: plain ``utf-8`` decodes a BOM-prefixed
    file successfully, so ``utf-8-sig`` further down the list would never get
    the chance to strip it.

    If every encoding fails, the file is decoded with the first encoding
    (``utf-8`` when *encodings* is empty) using ``errors='replace'``.
    """
    for enc in encodings:
        try:
            text = path.read_text(encoding=enc)
        except UnicodeDecodeError:
            continue
        return text.removeprefix('\ufeff')

    # Last resort: never raise on undecodable bytes, substitute U+FFFD instead.
    fallback = encodings[0] if encodings else 'utf-8'
    return path.read_text(encoding=fallback, errors='replace').removeprefix('\ufeff')

def get_file_timestamp(file: Path) -> str:
    """Return the timestamp used to order *file*, as a string.

    If ``<file>.first-seen.txt`` exists and is non-empty, its stripped text is
    returned verbatim (no parsing or validation is done). Otherwise the
    file's current modification time is returned as a UTC ISO-8601 string.

    Args:
        file: Path (or path string) of the meme file.
    """
    file = Path(file)
    meta_path = file.with_name(file.name + ".first-seen.txt")

    if meta_path.exists():
        recorded = read_text_multi(meta_path).strip()
        if recorded:
            return recorded
        # An empty sidecar carries no information; fall through to the mtime
        # rather than returning '' and corrupting the sort order.

    return datetime.fromtimestamp(file.stat().st_mtime, tz=timezone.utc).isoformat()
    
def get_topics():
    """Return the names of the topic folders directly under MEMES_ROOT, sorted.

    ``Path.iterdir()`` yields entries in arbitrary order, so the names are
    sorted to keep the generated JSON stable from run to run.
    """
    return sorted(p.name for p in MEMES_ROOT.iterdir() if p.is_dir())

def process_topic(topic: str) -> dict[int, dict]:
    """Collect every tracked meme in one topic folder, newest first.

    Args:
        topic: Name of a directory directly under MEMES_ROOT.

    Returns:
        A dict keyed 1..n (1 == newest) whose values have the keys ``file``
        (path string), ``filemtime`` (timestamp string from
        ``get_file_timestamp``) and ``metadata`` (metadata kind → sidecar path,
        only for sidecars that exist on disk).
    """
    # Candidate sidecar suffixes per metadata kind; the first one found wins.
    # The summary generator in this notebook writes ".llama3.2-vision.txt";
    # the hyphenated spelling is also accepted in case such files exist.
    sidecar_suffixes = {
        "tesseract-ocr": (".txt",),
        "llama-3.2-vision": (".llama3.2-vision.txt", ".llama-3.2-vision.txt"),
        "gemma3-27b-vision": (".gemma3-27b-vision.txt",),
    }

    topic_path = Path(MEMES_ROOT) / topic
    memes: list[dict] = []

    for meme_file in topic_path.iterdir():
        if meme_file.suffix.lower() not in TRACKED_EXTS or not meme_file.is_file():
            continue

        metadata = {}
        for kind, suffixes in sidecar_suffixes.items():
            for suffix in suffixes:
                sidecar = meme_file.with_name(meme_file.name + suffix)
                if sidecar.exists():
                    metadata[kind] = str(sidecar)
                    break

        memes.append({
            "file": str(meme_file),
            "filemtime": get_file_timestamp(meme_file),   # ISO-8601 string
            "metadata": metadata,
        })

    # Newest first. Timestamps are compared as strings, which orders correctly
    # as long as they share one ISO-8601 format and UTC offset.
    # Sorting by path first makes ties deterministic (list.sort is stable).
    memes.sort(key=lambda m: m["file"])
    memes.sort(key=lambda m: m["filemtime"], reverse=True)

    # re-index so 1 == newest
    return {i: _serialise(meme) for i, meme in enumerate(memes, 1)}

def _serialise(meme: dict) -> dict:
    """Convert dt → iso-string without microseconds so JSON dump works."""
    meme = meme.copy()
    meme["filemtime"] = meme["filemtime"]
    return meme

def build_master_json():
    """Build the master mapping of topic name → numbered memes for all topics.

    Each value comes straight from ``process_topic``, which already returns
    its memes ordered newest-first and keyed from 1, so no further sorting
    is done here.
    """
    return {topic: process_topic(topic) for topic in get_topics()}

# Build the full topic → memes mapping in memory.
master_json_data = build_master_json()


## Save JSON File

# json.dump's default ensure_ascii=True escapes non-ASCII characters, so the
# file content is plain ASCII whatever the platform's default text encoding is.
with open('memes.json', 'w') as json_file:
    json.dump(master_json_data, json_file, indent=4)



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip




# Generate missing llama-3.2-vision.txt files



In [5]:
import base64, requests, textwrap, time
from pathlib import Path
from requests.exceptions import RequestException

# Model tag placed in the "model" field of each request payload.
MODEL_NAME   = "llama3.2-vision:11b"
# URL that _summarise_image POSTs to.
OLLAMA_URL   = "http://docker-ai:11434/api/generate"
# Instruction sent alongside every image.
PROMPT       = (
    "In 2-3 sentences, describe this meme for someone who cannot see it. "
    "Include any text that appears in the image."
)
# Only files with these (lower-cased) suffixes are sent to the model.
IMAGE_EXTS   = {".jpeg", ".jpg", ".png"}

MAX_RETRIES  = 3        # total attempts per image
INITIAL_WAIT = 5        # seconds before first retry (doubles each time)
def format_duration(seconds):
    """Format a duration in seconds compactly, e.g. ``1d 2h 3m 4s``.

    The value is truncated to an int first. Zero-valued units are omitted,
    except that a duration with no non-zero unit is rendered as ``0s``.
    """
    remaining = int(seconds)
    pieces = []
    for label, size in (("d", 86400), ("h", 3600), ("m", 60)):
        count, remaining = divmod(remaining, size)
        if count:
            pieces.append(f"{count}{label}")
    if remaining or not pieces:
        pieces.append(f"{remaining}s")
    return " ".join(pieces)


def _meta_path(img: Path) -> Path:
    return img.with_name(f"{img.name}.llama3.2-vision.txt")

def _summarise_image(img: Path) -> str:
    """Ask the Ollama endpoint to describe *img* and return the stripped reply.

    The image is sent base64-encoded together with MODEL_NAME and PROMPT.
    Up to MAX_RETRIES attempts are made; after a failed attempt that is not
    the last, the function sleeps INITIAL_WAIT * 2**(attempt - 1) seconds.
    An empty reply counts as a failure.

    Raises:
        RequestException or ValueError: re-raised from the final failed attempt.
    """
    encoded = base64.b64encode(img.read_bytes()).decode()
    request_body = {
        "model": MODEL_NAME,
        "prompt": PROMPT,
        "stream": False,
        "images": [encoded],
    }

    attempt = 0
    while attempt < MAX_RETRIES:
        attempt += 1
        try:
            reply = requests.post(OLLAMA_URL, json=request_body, timeout=300)
            reply.raise_for_status()
            text = reply.json().get("response", "").strip()
            if text:
                return text
            raise ValueError("API returned empty 'response'")
        except (RequestException, ValueError) as err:
            print(f"[{img.name}] attempt {attempt}/{MAX_RETRIES} failed: {err}")
            if attempt >= MAX_RETRIES:
                raise  # bubbled up to main loop
            delay = INITIAL_WAIT * 2 ** (attempt - 1)
            print(f"   → retrying in {delay}s …")
            time.sleep(delay)

def create_all_summaries(root=MEMES_ROOT):
    """Generate a summary sidecar for every image under *root* that lacks one.

    An image qualifies when its lower-cased suffix is in IMAGE_EXTS, it is a
    regular file, and the path returned by ``_meta_path`` does not exist yet.
    Progress and a rough ETA (mean time of successful runs × images left) are
    printed after each image. A failure on one image is reported and counted,
    and processing continues with the next image.

    Returns:
        A list of ``[index, elapsed_seconds]`` pairs, one per summary written.
    """
    root = Path(root)
    images = [
        img for img in root.rglob('*')
        if img.suffix.lower() in IMAGE_EXTS
        and img.is_file()                      # ignore directories named like images
        and not _meta_path(img).exists()
    ]
    total = len(images)
    runs = []
    skipped = made = 0
    for idx, img in enumerate(images, 1):
        try:
            t0 = time.perf_counter()
            summary = _summarise_image(img)
            elapsed = time.perf_counter() - t0
            runs.append([idx, elapsed])
            meta = _meta_path(img)
            meta.write_text(summary + "\n", encoding='utf-8')
            made += 1
            # "\n" must be an escape sequence: a literal line break inside a
            # single-quoted f-string is a SyntaxError.
            print(f"\n⟹  {img.relative_to(root)}")
            print(textwrap.fill(summary, width=88))
            print(f"— saved to {meta.name} —")
            avg = sum(r[1] for r in runs) / len(runs)
            remaining = total - idx
            percent = idx / total * 100 if total else 100
            eta = remaining * avg
            print(f"{percent:.1f}% complete, ~{format_duration(eta)} remaining")
        except Exception as e:
            # Best effort by design: one bad image must not stop the batch.
            skipped += 1
            print(f"[skip] {img.relative_to(root)} → {e}")
    print(f"\n✓ Done. {made} files written, {skipped} skipped after retries.")
    return runs

# Run over MEMES_ROOT; images whose summary sidecar already exists are not re-processed.
create_all_summaries()



✓ Done. 0 files written, 0 skipped after retries.


# Generate missing gemma3-27b-vision.txt files

In [7]:
import base64, requests, textwrap, time
from pathlib import Path
from requests.exceptions import RequestException

# Model tag placed in the "model" field of each request payload.
# NOTE: this cell rebinds the same module-level names used by the llama cell above.
MODEL_NAME   = "gemma3:27b"
# URL that _summarise_image POSTs to.
OLLAMA_URL   = "http://docker-ai:11434/api/generate"
# Instruction sent alongside every image.
PROMPT       = (
    "Describe this in several sections with headings on the following topics (only if each topic applies): Visual Description, Foucauldian Genealogical Discourse Analysis, Critical Theory, Marxist Conflict Theory, Postmodernism, Queer Feminist Intersectional Analysis."
)
# Only files with these (lower-cased) suffixes are sent to the model.
IMAGE_EXTS   = {".jpeg", ".jpg", ".png"}

MAX_RETRIES  = 3        # total attempts per image
INITIAL_WAIT = 5        # seconds before first retry (doubles each time)
def format_duration(seconds):
    """Render *seconds* (truncated to an int) as a string like ``3h 2m 1s``.

    Units equal to zero are left out; if nothing else would be shown the
    result is ``0s``.
    """
    whole = int(seconds)
    minutes, secs = divmod(whole, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)

    labelled = [(days, "d"), (hours, "h"), (minutes, "m")]
    shown = [f"{value}{unit}" for value, unit in labelled if value]
    if secs or not shown:
        shown.append(f"{secs}s")
    return " ".join(shown)


def _meta_path(img: Path) -> Path:
    return img.with_name(f"{img.name}.gemma3-27b-vision.txt")

def _summarise_image(img: Path) -> str:
    """POST *img* (base64-encoded) to OLLAMA_URL and return the stripped reply text.

    Makes at most MAX_RETRIES attempts. Between attempts it sleeps
    INITIAL_WAIT * 2**(attempt - 1) seconds. An empty ``response`` field is
    treated as a failed attempt.

    Raises:
        RequestException or ValueError: re-raised once the last attempt fails.
    """
    encoded = base64.b64encode(img.read_bytes()).decode()
    request_body = dict(
        model=MODEL_NAME,
        prompt=PROMPT,
        stream=False,
        images=[encoded],
    )

    for attempt in range(1, MAX_RETRIES + 1):
        is_last = attempt >= MAX_RETRIES
        try:
            reply = requests.post(OLLAMA_URL, json=request_body, timeout=300)
            reply.raise_for_status()
            text = reply.json().get('response', '').strip()
            if text:
                return text
            raise ValueError("API returned empty 'response'")
        except (RequestException, ValueError) as err:
            print(f"[{img.name}] attempt {attempt}/{MAX_RETRIES} failed: {err}")
            if is_last:
                raise  # bubbled up to main loop
            delay = INITIAL_WAIT * 2 ** (attempt - 1)
            print(f"   → retrying in {delay}s …")
            time.sleep(delay)

def create_all_summaries(root=MEMES_ROOT):
    """Generate an analysis sidecar for every image under *root* that lacks one.

    An image qualifies when its lower-cased suffix is in IMAGE_EXTS, it is a
    regular file, and the path returned by ``_meta_path`` does not exist yet.
    Progress and a rough ETA (mean time of successful runs × images left) are
    printed after each image. A failure on one image is reported and counted,
    and processing continues with the next image.

    Returns:
        A list of ``[index, elapsed_seconds]`` pairs, one per sidecar written.
    """
    root = Path(root)
    images = [
        img for img in root.rglob('*')
        if img.suffix.lower() in IMAGE_EXTS
        and img.is_file()                      # ignore directories named like images
        and not _meta_path(img).exists()
    ]
    total = len(images)
    runs = []
    skipped = made = 0
    for idx, img in enumerate(images, 1):
        try:
            t0 = time.perf_counter()
            summary = _summarise_image(img)
            elapsed = time.perf_counter() - t0
            runs.append([idx, elapsed])
            meta = _meta_path(img)
            meta.write_text(summary + "\n", encoding='utf-8')
            made += 1
            # "\n" must be an escape sequence: a literal line break inside a
            # single-quoted f-string is a SyntaxError.
            print(f"\n⟹  {img.relative_to(root)}")
            print(textwrap.fill(summary, width=88))
            print(f"— saved to {meta.name} —")
            avg = sum(r[1] for r in runs) / len(runs)
            remaining = total - idx
            percent = idx / total * 100 if total else 100
            eta = remaining * avg
            print(f"{percent:.1f}% complete, ~{format_duration(eta)} remaining")
        except Exception as e:
            # Best effort by design: one bad image must not stop the batch.
            skipped += 1
            print(f"[skip] {img.relative_to(root)} → {e}")
    print(f"\n✓ Done. {made} files written, {skipped} skipped after retries.")
    return runs

# Run over MEMES_ROOT; images whose gemma sidecar already exists are not re-processed.
create_all_summaries()



⟹  AAVE\292777611_5475070732552171_7872519000336240751_n.jpg
Okay, let's break down this Tumblr post using the requested theoretical lenses. This is
a fascinating example of online discourse engaging with issues of language, race, and
power.   **Please note:** This analysis will be complex and may involve interpretation.
I'll strive to be thorough, but theory application is often open to debate.  ---  **1.
Visual Description**  The Tumblr post features:  *   **A Screenshot of a Tumblr Ask:** A
user asks "why do black people use you in the wrong context? such as 'you ugly' instead
of 'you're ugly'. I know you guys can differentiate, it's a nuisance."  *   **A
Response:** The response is simply the phrase "you a bitch." *   **A Follow-up
Explanation:** A longer text block from a user named “rsbenedict” dissects the
linguistic phenomenon at play and frames it within the context of AAVE (African American
Vernacular English). *   **An Image:** A black and white picture of a man with a bear

# Generate a markdown file for each meme

In [None]:
from pathlib import Path

# Order in which metadata sections are rendered on each meme page.
META_ORDER = ["gemma3-27b-vision", "llama-3.2-vision", "first-seen", "tesseract"]
# Sidecar filename suffix (appended to the full meme filename) per metadata kind.
# "gemma3-27b-vision" has to be present: the index builders look it up by key,
# and the page builder has a rendering branch for it.
META_SUFFIXES = {
    "gemma3-27b-vision": ".gemma3-27b-vision.txt",
    "llama-3.2-vision": ".llama3.2-vision.txt",
    "tesseract": ".txt",
    "first-seen": ".first-seen.txt",
}

def create_markdown_files(root=MEMES_ROOT):
    """Write a ``<meme>.md`` page next to every tracked meme under *root*.

    Each page has YAML front matter (layout, title, category) followed by one
    raw-HTML block: the image (or a download link when the suffix is not in
    IMAGE_EXTS), then one heading + paragraph per metadata sidecar that
    exists, in META_ORDER.
    """
    import json
    from urllib.parse import quote

    # Tooltip text for each metadata paragraph, keyed by metadata kind.
    tooltips = {
        "gemma3-27b-vision": "Gemma3-27B is a really good model.",
        "llama-3.2-vision": (
            "Llama-3.2-11B is a really good model that probably gets the visual "
            "details right but doesn't understand literary or media references, "
            "and often fails to accurately represent the physical arrangement of "
            "objects and the implied relationships between the objects."
        ),
        "first-seen": (
            "Because Git doesn't preserve file modification times, this metadata "
            "file contains the file's modification time when it was added to the "
            "library."
        ),
        "tesseract": (
            "Tesseract is often terrible and just gives a lot of nonsense "
            "characters, but it used to be the state of the art, and usually it is "
            "better at correctly representing text than llama-3.2-vision-11b."
        ),
    }

    root = Path(root)
    for meme in root.rglob('*'):
        if meme.suffix.lower() not in TRACKED_EXTS or not meme.is_file():
            continue

        md_file = meme.with_name(meme.name + '.md')
        url = quote(meme.name)                # percent-encode spaces, '#', '%', …
        label = escape_entities(meme.name)    # safe as HTML text
        lines = [
            '---\n',
            'layout: meme\n',
            # A JSON string is a valid double-quoted YAML scalar, so names
            # containing ':' or '#', or looking like numbers/booleans, survive.
            f'title: {json.dumps(meme.name, ensure_ascii=False)}\n',
            f'category: {json.dumps(meme.parent.name, ensure_ascii=False)}\n',
            '---\n\n',
        ]

        lines.append('<div markdown="0">')
        # NOTE(review): IMAGE_EXTS is the vision-model input set, so .gif/.webp/
        # .svg/.jfif get a download link rather than an <img>. Confirm intent.
        if meme.suffix.lower() in IMAGE_EXTS:
            lines.append(f'<a href="{url}"><img class="photo" src="{url}" /></a>\n\n')
        else:
            # Markdown link syntax is not processed inside markdown="0",
            # so the link has to be plain HTML.
            lines.append(f'<a href="{url}">Download {label}</a>\n\n')

        for meta in META_ORDER:
            meta_path = meme.with_name(meme.name + META_SUFFIXES[meta])
            if not meta_path.exists():
                continue

            content = escape_entities(read_text_multi(meta_path).strip())
            tooltip = tooltips.get(meta)
            title_attr = f' title="{tooltip}"' if tooltip else ''
            lines.append(f'<h2>{meta}</h2>\n')
            lines.append(f'<p{title_attr}>{content}</p>\n\n')

        lines.append('</div>\n\n')
        md_file.write_text(''.join(lines), encoding='utf-8')

# Regenerate the per-meme pages for every tracked file under MEMES_ROOT (existing .md files are overwritten).
create_markdown_files()


# Generate new markdown index files for each category directory

In [None]:
def create_category_indexes(root=MEMES_ROOT):
    """Write ``index.md`` in every category directory directly under *root*.

    Each index lists the category's tracked memes as cards ordered by the
    text of their ``first-seen`` sidecar, newest first (memes without one
    sort last). Gemma and Llama descriptions are shown when their sidecars
    exist; the Llama description is also used as the image alt text.
    """
    import json
    from urllib.parse import quote

    def _sidecar_text(meme, kind, default_suffix):
        # Fall back to the conventional suffix when META_SUFFIXES lacks the
        # key, so a missing entry cannot abort the run with a KeyError.
        sidecar = meme.with_name(meme.name + META_SUFFIXES.get(kind, default_suffix))
        return read_text_multi(sidecar).strip() if sidecar.exists() else ''

    root = Path(root)
    for category in sorted(root.iterdir()):
        if not category.is_dir():
            continue

        # (first_seen, filename, llama_text, gemma_text) per meme. Iterating in
        # sorted order makes ties on first_seen deterministic (stable sort).
        entries = []
        for meme in sorted(category.iterdir()):
            if meme.suffix.lower() not in TRACKED_EXTS or not meme.is_file():
                continue
            entries.append((
                _sidecar_text(meme, 'first-seen', '.first-seen.txt'),
                meme.name,
                _sidecar_text(meme, 'llama-3.2-vision', '.llama3.2-vision.txt'),
                _sidecar_text(meme, 'gemma3-27b-vision', '.gemma3-27b-vision.txt'),
            ))
        entries.sort(key=lambda e: e[0], reverse=True)

        cat_text = escape_entities(category.name)
        # NOTE(review): this href is relative to the page it appears on; from
        # memes/<cat>/index.html it only resolves if the layout sets a <base>.
        cat_href = f'memes/{quote(category.name)}/index.html'
        quoted_name = json.dumps(category.name, ensure_ascii=False)

        lines = [
            '---\n',
            'layout: category\n',
            f'title: {quoted_name}\n',
            f'category: {quoted_name}\n',
            '---\n\n',
        ]
        for first_seen, name, llama, gemma in entries:
            fs = escape_entities(first_seen)
            llama = escape_entities(llama)
            gemma = escape_entities(gemma)
            img_url = quote(name)
            page_url = quote(name + '.html')

            lines.append('<div markdown="0">\n')
            lines.append(f'<div class="card mb-4" data-category="{cat_text}" data-pubdate="{fs}">\n')
            lines.append(f'  <a href="{page_url}"><img class="card-img-top" loading="lazy" src="{img_url}" alt="{llama}" /></a>\n')
            lines.append('  <div class="card-body">\n')
            lines.append(f'    <p><a href="{cat_href}">{cat_text}</a></p>\n')
            if fs:
                lines.append(f'    <p class="card-text text-muted small firstseen">{fs}</p>\n')
            if gemma:
                lines.append(f'    <p class="card-text text-muted small gemma-output">{gemma}</p>\n')
            if llama:
                lines.append(f'    <p class="card-text text-muted small llama-output">{llama}</p>\n')
            lines.append('  </div>\n')
            lines.append('</div>\n')
            lines.append('</div>\n\n')   # closes the markdown="0" wrapper

        (category / 'index.md').write_text(''.join(lines), encoding='utf-8')

# Rebuild index.md for every category directory under MEMES_ROOT.
create_category_indexes()

# Generate main index markdown

In [None]:
def create_main_index(root=MEMES_ROOT, out_file=Path("index.md")):
    """Write the site-wide index listing every tracked meme in every category.

    Cards are ordered by the text of each meme's ``first-seen`` sidecar,
    newest first (memes without one sort last). Gemma and Llama descriptions
    are shown when their sidecars exist; the Llama description is also used
    as the image alt text.

    Args:
        root: Directory whose immediate sub-directories are the categories.
        out_file: Path of the markdown file to write.
    """
    from urllib.parse import quote

    def _sidecar_text(meme, kind, default_suffix):
        # Fall back to the conventional suffix when META_SUFFIXES lacks the
        # key, so a missing entry cannot abort the run with a KeyError.
        sidecar = meme.with_name(meme.name + META_SUFFIXES.get(kind, default_suffix))
        return read_text_multi(sidecar).strip() if sidecar.exists() else ''

    root = Path(root)
    # (first_seen, category, filename, llama_text, gemma_text) per meme.
    # Iterating in sorted order makes ties on first_seen deterministic.
    entries = []
    for category in sorted(root.iterdir()):
        if not category.is_dir():
            continue
        for meme in sorted(category.iterdir()):
            if meme.suffix.lower() not in TRACKED_EXTS or not meme.is_file():
                continue
            entries.append((
                _sidecar_text(meme, 'first-seen', '.first-seen.txt'),
                category.name,
                meme.name,
                _sidecar_text(meme, 'llama-3.2-vision', '.llama3.2-vision.txt'),
                # Read the gemma sidecar itself, not the llama one.
                _sidecar_text(meme, 'gemma3-27b-vision', '.gemma3-27b-vision.txt'),
            ))
    entries.sort(key=lambda e: e[0], reverse=True)

    lines = [
        '---\n',
        'layout: homepage\n',
        'title: "memes.cjtrowbridge.com"\n',
        '---\n\n',
    ]
    for first_seen, cat, name, llama, gemma in entries:
        fs = escape_entities(first_seen)
        llama = escape_entities(llama)
        gemma = escape_entities(gemma)
        cat_text = escape_entities(cat)
        img_url = quote(f'memes/{cat}/{name}')
        page_url = quote(f'memes/{cat}/{name}.html')
        cat_href = quote(f'memes/{cat}/index.html')

        lines.append('<div markdown="0">\n')
        lines.append(f'<div class="card mb-4" data-category="{cat_text}" data-pubdate="{fs}">\n')
        lines.append(f'  <a href="{page_url}"><img class="card-img-top" loading="lazy" src="{img_url}" alt="{llama}" /></a>\n')
        lines.append('  <div class="card-body">\n')
        lines.append(f'    <p><a href="{cat_href}">{cat_text}</a></p>\n')
        if fs:
            lines.append(f'    <p class="card-text text-muted small firstseen">{fs}</p>\n')
        if gemma:
            lines.append(f'    <p class="card-text text-muted small gemma-output"><b>Gemma3-27B:</b> {gemma}</p>\n')
        if llama:
            lines.append(f'    <p class="card-text text-muted small llama-output"><b>Llama-3.2-11B:</b> {llama}</p>\n')
        lines.append('  </div>\n')
        lines.append('</div>\n')
        lines.append('</div>\n\n')   # closes the markdown="0" wrapper

    Path(out_file).write_text(''.join(lines), encoding='utf-8')

# Rebuild the site-wide index (written to index.md in the working directory by default).
create_main_index()


# Build like Jekyll

In [None]:
import sys, subprocess, importlib, textwrap
from pathlib import Path

# Root of the tree scanned for markdown files: the kernel's working directory.
REPO_ROOT = Path.cwd()
# Source and output file suffixes for the conversion.
MD_EXT = ".md"
HTML_EXT = ".html"

# pip distribution name → importable module name for each required package.
PKGS = {"python-frontmatter": "frontmatter",
        "markdown": "markdown",
        "python-liquid": "liquid"}

def _pip_install(pkg):
    """Install *pkg* with pip, using the interpreter that runs this kernel.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    print(f"▶ installing {pkg} …")
    command = [sys.executable, "-m", "pip", "install", "--quiet", pkg]
    subprocess.check_call(command)

# Install any package from PKGS whose module cannot be imported yet.
for pip_name, mod_name in PKGS.items():
    try:
        importlib.import_module(mod_name)
    except ModuleNotFoundError:
        _pip_install(pip_name)

import frontmatter, markdown
# Liquid is treated as optional: when the module is unavailable, render_html
# skips the Liquid pass. `env` is only defined when the import succeeds.
try:
    import liquid
    HAVE_LIQUID = True
    env = liquid.Environment()
except ModuleNotFoundError:
    HAVE_LIQUID = False

# Directory holding the HTML layout templates, one `<layout>.html` per layout name.
LAYOUT_DIR = REPO_ROOT / "_layout"

# Fallback template used when a page names a layout that has no file.
# Read eagerly, so this cell fails if _layout/default.html does not exist.
DEFAULT_LAYOUT = LAYOUT_DIR / "default.html"
DEFAULT_TEMPLATE = DEFAULT_LAYOUT.read_text(encoding="utf-8")


def render_html(md_path: Path) -> str:
    """Render one markdown file with front matter into a full HTML page.

    The markdown body is converted to HTML, passed through Liquid (when
    available) with the front matter as variables, and inserted into the
    layout named by the ``layout`` key (DEFAULT_TEMPLATE if that layout file
    does not exist). The literal placeholders ``{title}``, ``{content}`` and
    ``{category}`` in the template are then replaced, in that order.
    """
    post = frontmatter.load(md_path)

    body_html = markdown.markdown(
        post.content,
        extensions=["extra", "codehilite", "toc", "tables", "sane_lists"],
    )
    if HAVE_LIQUID:
        body_html = env.from_string(body_html).render(**post.metadata)

    # Pick the layout file, falling back to the default template.
    layout_file = LAYOUT_DIR / f"{post.get('layout') or 'default'}.html"
    template = (
        layout_file.read_text(encoding="utf-8")
        if layout_file.exists()
        else DEFAULT_TEMPLATE
    )

    substitutions = (
        ("{title}", str(post.get("title") or md_path.stem)),
        ("{content}", body_html),
        ("{category}", str(post.get("category") or "uncategorized")),
    )
    page = template
    for placeholder, value in substitutions:
        page = page.replace(placeholder, value)
    return page

# Directory-name prefixes whose contents are never converted.
_EXCLUDED_DIR_PREFIXES = (".venv", ".git", ".ipynb_checkpoints", "_site")

converted, skipped = 0, 0
for md_file in REPO_ROOT.rglob(f"*{MD_EXT}"):
    # rglob yields paths that start with REPO_ROOT, which is absolute, so
    # parts[0] is the filesystem anchor and can never match an excluded name.
    # Test the directory components relative to the repo root instead; checking
    # all of them also catches nested folders such as .ipynb_checkpoints.
    rel_dirs = md_file.relative_to(REPO_ROOT).parts[:-1]
    if any(part.startswith(_EXCLUDED_DIR_PREFIXES) for part in rel_dirs):
        continue
    html_path = md_file.with_suffix(HTML_EXT)
    # Skip pages whose HTML is at least as new as the markdown source.
    if html_path.exists() and html_path.stat().st_mtime >= md_file.stat().st_mtime:
        skipped += 1
        continue
    html_path.write_text(render_html(md_file), encoding="utf-8")
    converted += 1
    print("✓", html_path.relative_to(REPO_ROOT))
print(f"\n🎉  Done. {converted} file(s) converted, {skipped} up-to-date.")
