# Memes Library Builder

This library contains tens of thousands of memes organized in hundreds of folders by topic. This notebook builds the master json file which contains a list of all the topics, all the memes in each topic, and a list of any metadata associated with each meme.


In [1]:
## Import Libraries

import os
import json
from glob import glob
from pathlib import Path


In [2]:
## Define Constants

MEMES_ROOT = Path('memes')


# Create missing first-seen files

Iterate recursively through all the subdirectories of the ./memes folder and for any file with one of the following extensions, check if it already has a file with the same name in the same directory but appended with first-seen.txt. So for example meme.jpg meansthere should also be a file called meme.jpg.first-seen.txt  

If not, create a new file containing the file modification time as a python datetime. This file should have the same name but appended with first-seen.txt. So for example meme.jpg means we need to create a file called meme.jpg.first-seen.txt containing the python datetime of the modification time of the meme.jpg file.  

- .gif
- .jfif
- .jpeg
- .jpg
- .mp4
- .png
- .svg
- .webp




In [3]:
from pathlib import Path
from datetime import datetime, timezone


TRACKED_EXTS = {
    ".gif", ".jfif", ".jpeg", ".jpg",
    ".mp4", ".png", ".svg", ".webp",
}

def ensure_first_seen_files(root: Path, exts: set[str]) -> dict[str, int]:
    created = 0
    skipped = 0

    for file in root.rglob("*"):
        if file.suffix.lower() not in exts or not file.is_file():
            continue

        meta_path = file.with_name(file.name + ".first-seen.txt")

        if meta_path.exists():
            skipped += 1
            continue

        # ▶ grab mtime and convert to UTC datetime
        mod_time = datetime.fromtimestamp(file.stat().st_mtime,
                                          tz=timezone.utc)
        # ▶ write ISO-8601 string
        meta_path.write_text(mod_time.isoformat())

        created += 1

    return {"created": created, "skipped": skipped}

summary = ensure_first_seen_files(MEMES_ROOT, TRACKED_EXTS)
print(f"First-seen files created: {summary['created']}")
print(f"Already present / skipped : {summary['skipped']}")

First-seen files created: 0
Already present / skipped : 8404


# Build the master memes.json file

The memes are organized like so;
/memes/Topic 1
/memes/Topic 2

Memes can be images or videos. Assume all the common file extensions will be present. For each meme, a number of metadata files may be present. These should be included if present.

For example;
memefilename.jpg
memefilename.jpg.txt <- Canonical tesseract-OCR of the meme. Might be nonsense. Probably we won't need this but it's there, so lets include it in the json file.
memefilename.jpg.llama-3.2-vision.txt <- High quality transformer analysis of the image, containing detailed explanation of the visual elements of the image, including any text, but probably lacking an awareness of any social context or any relationship to current events.

Example output:

'Topic 1' => {
    1 => {
        'file' => 'memes/topic/filename.jpg',
        'filemtime' => 'date the file was last modified',
        'metadata' => {
            'tesseract-ocr' => 'memes/topic/filename.jpg.txt',
            'llama-3.2-vision' => 'memes/topic/filename.jpg.llama-3.2-vision.txt'
        }
    }
}

The list of memes in each topic must be ordered by filemtime descending, such that the most recently added item is number 1. 

In [4]:
# Helper functions

def get_file_timestamp(file: Path) -> datetime:
    
    file = Path(file)
    
    meta_path = file.with_name(file.name + ".first-seen.txt")

    if meta_path.exists():
        # Read the timestamp from first-seen.txt
        mod_time_str = meta_path.read_text().strip()
        return mod_time_str
    else:
        # Fallback to current modification time
        return datetime.fromtimestamp(file.stat().st_mtime, tz=timezone.utc).isoformat()
    
def get_topics():
    """Return list of topic folders in the memes directory"""
    return [p.name for p in MEMES_ROOT.iterdir() if p.is_dir()]

def process_topic(topic: str) -> dict[int, dict]:
    topic_path = Path(MEMES_ROOT) / topic
    memes: list[dict] = []

    for meme_file in topic_path.iterdir():
        if meme_file.suffix.lower() not in TRACKED_EXTS or not meme_file.is_file():
            continue

        memes.append({
            "file": str(meme_file),
            "filemtime": get_file_timestamp(meme_file),   # dt obj
            "metadata": {
                k: str(meme_file.with_name(meme_file.name + suffix))
                for k, suffix in {
                    "tesseract-ocr": ".txt",
                    "llama-3.2-vision": ".llama-3.2-vision.txt"
                }.items()
                if meme_file.with_name(meme_file.name + suffix).exists()
            }
        })

    # newest first
    memes.sort(key=lambda m: m["filemtime"], reverse=True)

    # re-index so 1 == newest
    return {i: _serialise(meme) for i, meme in enumerate(memes, 1)}

def _serialise(meme: dict) -> dict:
    """Convert dt → iso-string without microseconds so JSON dump works."""
    meme = meme.copy()
    meme["filemtime"] = meme["filemtime"]
    return meme

def build_master_json():
    """Build the master JSON file with all topics and all memes"""
    master_dict = {}

    for topic in get_topics():
        master_dict[topic] = process_topic(topic)

    # Sort each topic's memes by filemtime descending
    for topic, memes in master_dict.items():
        sorted_memes = dict(sorted(memes.items(), key=lambda item: item[1]['filemtime'], reverse=True))
        master_dict[topic] = sorted_memes

    return master_dict

master_json_data = build_master_json()


## Save JSON File

with open('memes.json', 'w') as json_file:
    json.dump(master_json_data, json_file, indent=4)


# Generate missing llama-3.2-vision.txt files



In [5]:
import base64, requests, textwrap, time
from pathlib import Path
from requests.exceptions import RequestException

MODEL_NAME   = "llama3.2-vision:11b"
OLLAMA_URL   = "http://docker-ai:11434/api/generate"
PROMPT       = (
    "In 2-3 sentences, describe this meme for someone who cannot see it. "
    "Include any text that appears in the image."
)
IMAGE_EXTS   = {".gif", ".jfif", ".jpeg", ".jpg", ".png", ".svg", ".webp"}

MAX_RETRIES  = 3        # total attempts per image
INITIAL_WAIT = 5        # seconds before first retry (doubles each time)

def _meta_path(img: Path) -> Path:
    return img.with_name(f"{img.name}.llama3.2-vision.txt")

def _summarise_image(img: Path) -> str:
    """Call Ollama with retries; raise after MAX_RETRIES failures."""
    img_b64  = base64.b64encode(img.read_bytes()).decode()
    payload  = {
        "model": MODEL_NAME,
        "prompt": PROMPT,
        "stream": False,
        "images": [img_b64],
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.post(OLLAMA_URL, json=payload, timeout=300)
            r.raise_for_status()
            summary = r.json().get("response", "").strip()
            if not summary:
                raise ValueError("API returned empty 'response'")
            return summary

        except (RequestException, ValueError) as err:
            wait = INITIAL_WAIT * 2 ** (attempt - 1)
            print(f"[{img.name}] attempt {attempt}/{MAX_RETRIES} failed: {err}")
            if attempt < MAX_RETRIES:
                print(f"   → retrying in {wait}s …")
                time.sleep(wait)
            else:
                raise  # bubbled up to main loop

def create_all_summaries(root=MEMES_ROOT):
    skipped, made = 0, 0
    for img in Path(root).rglob("*"):
        if img.suffix.lower() not in IMAGE_EXTS or _meta_path(img).exists():
            continue

        try:
            summary = _summarise_image(img)
            meta    = _meta_path(img)
            meta.write_text(summary + "\n", encoding="utf-8")
            made += 1

            print(f"\n⟹  {img.relative_to(root)}")
            print(textwrap.fill(summary, width=88))
            print(f"— saved to {meta.name} —\n")

        except Exception as e:
            skipped += 1
            print(f"[skip] {img.relative_to(root)} → {e}")

    print(f"\n✓ Done. {made} files written, {skipped} skipped after retries.")

# create_all_summaries()

# Generate a markdown file for each meme

In [7]:
from pathlib import Path

META_ORDER = ["llama-3.2-vision", "tesseract", "first-seen"]
META_SUFFIXES = {
    "llama-3.2-vision": ".llama-3.2-vision.txt",
    "tesseract": ".txt",
    "first-seen": ".first-seen.txt",
}

def create_markdown_files(root=MEMES_ROOT):
    root = Path(root)
    for meme in root.rglob('*'):
        if meme.suffix.lower() not in TRACKED_EXTS or not meme.is_file():
            continue

        md_file = meme.with_name(meme.name + '.md')
        lines = [
            '---',
            'layout: default',
            f'title: {meme.name}',
            '---',
        ]

        if meme.suffix.lower() in IMAGE_EXTS:
            lines.append(f'<a href="{meme.name}"><img class="photo" src="{meme.name}" /></a>')
        else:
            lines.append(f'[Download {meme.name}]({meme.name})')

        for meta in META_ORDER:
            meta_path = meme.with_name(meme.name + META_SUFFIXES[meta])
            if meta_path.exists():
                content = meta_path.read_text().strip()
                lines.append(f'<h2>{meta}</h2>')
                lines.append(f'<p>{content}</p>')

        md_file.write_text(''.join(lines), encoding='utf-8')

create_markdown_files()


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 111: character maps to <undefined>

# Generate new markdown files for each category directory