In [1]:
# Section 0 - Reset + Pin exact versions
import sys, os, importlib, shutil, pathlib

# Use this interpreter's pip 
pip = f'"{sys.executable}" -m pip'

# 1) Cleanly remove any preloaded modules from memory
for m in list(sys.modules):
    if m.startswith("transformers") or m.startswith("tokenizers"):
        del sys.modules[m]

# 2) Uninstall then install the pinned versions
print("Uninstalling transformers & tokenizers (if present)...")
!{pip} uninstall -y transformers tokenizers >/dev/null

print("Installing transformers==4.49.0 and tokenizers==0.21.4 ...")
!{pip} install --no-cache-dir "transformers==4.49.0" "tokenizers==0.21.4"

# 3) Verify versions
import transformers, tokenizers
print("transformers:", transformers.__version__)
print("tokenizers:  ", tokenizers.__version__)

print("Installing einops + timm...")
!{pip} install --no-cache-dir einops timm

Uninstalling transformers & tokenizers (if present)...
Installing transformers==4.49.0 and tokenizers==0.21.4 ...


The system cannot find the path specified.




  from .autonotebook import tqdm as notebook_tqdm


transformers: 4.49.0
tokenizers:   0.21.4
Installing einops + timm...


In [2]:
# Section 1 - Variables

# --- Model / Task ---
MODEL_ID: str           = "microsoft/Florence-2-base"
TASK_CAPTION: str       = "<MORE_DETAILED_CAPTION>"                 # or just <CAPTION>
IMAGE_PLACEHOLDER: str  = "<image>"                                 # REQUIRED token for Florence-2 prompts
REVISION                = "main"

# --- Inference knobs ---
MAX_NEW_TOKENS: int     = 128
NUM_BEAMS: int          = 3
RESIZE_LONG_SIDE: int   = 512

# --- Device / dtype preferences ---
PREFERRED_DTYPE: str    = "float32"                             
FORCE_CPU: bool         = False

# --- Image sources ---
TRAIN_DIR: str          = r"F:\deepfashion2\images\train\train\image"
N_SAMPLES: int          = 15

# --- Output / logging ---
SAVE_JSON_PATH: str     = "florence_preview_output.json"
PRINT_DEBUG: bool       = True

In [3]:
# Section 2 - Imports & Version Checks

import os, sys, re, json, platform, importlib, math, warnings
from typing import List, Dict, Any, Tuple, Optional
from pathlib import Path

os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
warnings.filterwarnings("ignore", category=UserWarning, module="huggingface_hub.file_download")

import numpy as np
from PIL import Image

import torch
import torchvision.transforms as T
import transformers
from transformers import AutoProcessor, AutoModelForCausalLM


import matplotlib.pyplot as plt

def _get_ver(name: str) -> Optional[str]:
    try:
        return importlib.import_module(name).__version__
    except Exception:
        return None

python_ver = sys.version.split()[0]
print("=== Runtime / Versions ===")
print(f"Python:         {python_ver}  ({platform.system()} {platform.release()})")
print(f"PyTorch:        {_get_ver('torch')}")
print(f"Transformers:   {_get_ver('transformers')}")
print(f"Pillow (PIL):   {_get_ver('PIL')}")
print(f"Matplotlib:     {_get_ver('matplotlib')}")
print(f"TorchVision:    {_get_ver('torchvision')}")


=== Runtime / Versions ===
Python:         3.10.18  (Windows 10)
PyTorch:        2.5.1+cu121
Transformers:   4.49.0
Pillow (PIL):   11.3.0
Matplotlib:     3.10.7
TorchVision:    0.20.1+cu121


In [4]:
# Section 3 - Device, DType, and Image Utilities

if FORCE_CPU:
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0") if torch.cuda.is_available() else (
        torch.device("mps") if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else torch.device("cpu")
    )

# --- DType selection ---
def pick_dtype(pref: str, device: torch.device) -> torch.dtype:
    pref = pref.lower()
    if device.type == "cuda":
        # bf16 support is only on newer GPUs
        bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
        if pref == "bfloat16" and bf16_ok:
            return torch.bfloat16
        if pref in ("float16", "bfloat16") and not bf16_ok:
            return torch.float16  # good default on CUDA if bf16 not supported
        return torch.float16 if pref != "float32" else torch.float32
    elif device.type == "mps":
        # MPS generally prefers float32 for safety
        return torch.float32
    else:
        # CPU fallback
        return torch.float32

dtype = pick_dtype(PREFERRED_DTYPE, device)

print("=== Hardware ===")
print(f"Selected device: {device}")
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"dtype: {dtype}")

# --- Image utilities ---
def load_image(path: str) -> Image.Image:
    """Load an image from disk as RGB."""
    img = Image.open(path).convert("RGB")
    return img

def resize_long_side(img: Image.Image, long_side: int = 512) -> Image.Image:
    """Resize the longer side of the image to `long_side` while maintaining aspect ratio."""
    w, h = img.size
    if max(w, h) <= long_side:
        return img
    if w >= h:
        new_w = long_side
        new_h = int(h * (long_side / w))
    else:
        new_h = long_side
        new_w = int(w * (long_side / h))
    return img.resize((new_w, new_h), Image.BICUBIC)

def discover_images(root: str, n: int = 3) -> List[str]:
    """
    Recursively find up to n image files under `root`.
    """
    candidates = []
    roots_to_try = [root]
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    for r in roots_to_try:
        p = Path(r)
        if not p.exists():
            continue
        for fp in p.rglob("*"):
            if fp.suffix.lower() in exts and ("__MACOSX" not in str(fp)):
                candidates.append(str(fp))
        if len(candidates) >= n:
            break
    candidates = sorted(set(candidates))
    if len(candidates) < n:
        raise FileNotFoundError(
            f"Could not find {n} images under {root}. Found {len(candidates)}."
        )
    return candidates[:n]

=== Hardware ===
Selected device: cuda:0
GPU: NVIDIA GeForce RTX 2060
dtype: torch.float32


In [5]:
# Section 4 - Load Florence-2 (Processor + Model)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    revision=REVISION,
    trust_remote_code=True,
)
load_dtype = dtype if (hasattr(torch, "cuda") and device.type != "cpu") else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    revision=REVISION,
    torch_dtype=load_dtype,                 
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model.to(device)
model.eval()

print("=== Florence-2 ===")
print(f"Model:      {MODEL_ID}")
print(f"On device:  {next(model.parameters()).device}")


=== Florence-2 ===
Model:      microsoft/Florence-2-base
On device:  cuda:0


In [6]:
# Section 5 - Garment-Focused Ontology + Normalizers + Caption -> Tags (types/colors/patterns)

import re
from typing import Dict, List, Optional, Tuple

# --- Ontology (canonical forms on the right) ---
TYPE_ALIASES = {
    # tops
    "tee": "t-shirt", "tshirt": "t-shirt", "t-shirt": "t-shirt", "t shirt": "t-shirt",
    "tee shirt": "t-shirt", "t- shirt": "t-shirt", "top": "top",
    "blouse": "blouse", "shirt": "shirt", "polo": "polo shirt",
    "hoodie": "hoodie", "sweatshirt": "sweatshirt", "sweater": "sweater", "cardigan": "cardigan",
    "tank": "tank top", "tanktop": "tank top", "camisole": "camisole", "crop": "crop top",
    # outerwear
    "jacket": "jacket", "coat": "coat", "blazer": "blazer", "parka": "parka",
    "windbreaker": "windbreaker", "raincoat": "raincoat",
    # dresses/one-piece
    "dress": "dress", "jumpsuit": "jumpsuit", "romper": "romper",
    # bottoms
    "jeans": "jeans", "pants": "pants", "trousers": "trousers", "slacks": "trousers",
    "shorts": "shorts", "chinos": "pants", "leggings": "leggings", "skirt": "skirt",
    "sweatpants": "sweatpants", "joggers": "joggers",
    # footwear
    "sneakers": "sneakers", "tennis": "sneakers", "running": "sneakers", "shoes": "shoes",
    "boots": "boots", "ankle": "boots", "heels": "heels", "loafers": "loafers", "sandals": "sandals",
    # accessories
    "hat": "hat", "cap": "cap", "beanie": "beanie", "scarf": "scarf",
    "gloves": "gloves", "belt": "belt", "bag": "bag", "backpack": "backpack",
    # parts/trim 
    "hood": "hood", "collar": "collar", "sleeve": "sleeve"
}
FASHION_TYPES = set(TYPE_ALIASES.values())

COLOR_ALIASES = {
    "navy": "blue", "teal": "blue", "aqua": "blue", "cyan": "blue", "azure": "blue", "indigo": "blue",
    "maroon": "red", "burgundy": "red", "crimson": "red",
    "ivory": "white", "cream": "white", "off-white": "white", "off white": "white",
    "charcoal": "grey", "slate": "grey", "silver": "grey", "gray": "grey",
    "tan": "brown", "beige": "brown", "khaki": "brown", "camel": "brown",
    "gold": "yellow", "blonde": "yellow", "blond": "yellow",
    "violet": "purple", "magenta": "pink", "fuchsia": "pink", "mustard": "yellow"
}
BASIC_COLORS = {
    "black", "white", "grey", "blue", "red", "green", "yellow",
    "purple", "pink", "brown", "orange"
}

# Patterns normalized to canonical names
PATTERN_ALIASES = {
    "polka": "polka dot", "polka-dot": "polka dot", "polka dot": "polka dot",
    "plaid": "plaid", "tartan": "plaid",
    "striped": "striped", "stripes": "striped",
    "checks": "checkered", "checked": "checkered", "checkered": "checkered",
    "floral": "floral", "flowers": "floral",
    "graphic": "graphic", "logo": "logo", "print": "graphic",
    "camouflage": "camouflage", "camo": "camouflage",
    "leopard": "animal print", "zebra": "animal print", "animal": "animal print"
}
PATTERNS = set(PATTERN_ALIASES.values())

# --- Heuristics for focusing on garment-related text ---
CLOTHING_HINTS = set(TYPE_ALIASES.keys()) | FASHION_TYPES
WEAR_VERBS = {"wear", "wears", "wearing", "wore", "dressed", "dressing", "has on"}

NONCLOTHING_HINTS = {
    "bed","blanket","pillow","wall","floor","sofa","couch","curtain","bracelet","earring","necklace",
    "hair","skin","makeup","room","kitchen","bedroom","living","bathroom","mirror","background",
    "hand","hands","arm","arms","wrist","waist","leg","legs","face","faces","eye","eyes"
}

# --- Normalizers ---
def normalize_type(word: str) -> Optional[str]:
    w = word.lower().strip().replace("-", " ")
    return TYPE_ALIASES.get(w)

def _has_wear_verb(s: str) -> bool:
    return bool(re.search(r"\b(wear|wears|wearing|wore|dressed|dressing|has on)\b", s))

def _adjacent_to(tokens, idx, vocab: set) -> bool:
    """True if token at idx has a left/right neighbor inside `vocab`."""
    left  = tokens[idx-1][0] if idx - 1 >= 0 else None
    right = tokens[idx+1][0] if idx + 1 < len(tokens) else None
    return (left in vocab) or (right in vocab)

def normalize_color(word: str) -> Optional[str]:
    w = word.lower().strip().replace("-", " ")
    w = COLOR_ALIASES.get(w, w)
    return w if w in BASIC_COLORS else None

def normalize_pattern(word: str) -> Optional[str]:
    w = word.lower().strip()
    w = PATTERN_ALIASES.get(w, w)
    return w if w in PATTERNS else None

# --- Tokenization helpers ---
def _tokens_with_spans(text: str) -> List[Tuple[str, int, int]]:
    return [(m.group(0).lower(), m.start(), m.end())
            for m in re.finditer(r"[a-zA-Z]+(?:-[a-zA-Z]+)?", text)]

def _sentence_spans(text: str) -> List[Tuple[int, int]]:
    spans, start = [], 0
    for m in re.finditer(r"(?<=[.!?])\s+", text):
        end = m.start()
        spans.append((start, end))
        start = m.end()
    spans.append((start, len(text)))
    return spans

# --- Main: garment-focused caption → tags ---
def extract_tags_from_caption(caption: str,
                              proximity_window: int = 6,
                              max_colors: int = 4) -> Dict[str, List[str]]:
    """
    Extract types/colors/patterns from a caption, but only from:
      1) sentences that contain clothing words or wear-verbs, and
      2) tokens within a proximity window around clothing words.
    Also ignores obvious background nouns (bed, bracelet, wall, etc.).
    """
    text = caption.strip()
    if not text:
        return {"type": [], "color": [], "pattern": []}

    tokens = _tokens_with_spans(text)            # [(tok, a, b), ...]
    sents = _sentence_spans(text)                # [(a, b), ...]
    text_l = text.lower()

    # Mark sentences that talk about clothing
    clothing_sent_idx = set()
    for i, (a, b) in enumerate(sents):
        s = text_l[a:b]
        if _has_wear_verb(s) or any(w in s for w in CLOTHING_HINTS):
            clothing_sent_idx.add(i)

    # Collect token indices in relevant zones
    relevant_idx = set()

    # (A) proximity to clothing words
    clothing_token_positions = [i for i, (tok, _, _) in enumerate(tokens) if tok in CLOTHING_HINTS]
    for pos in clothing_token_positions:
        lo = max(0, pos - proximity_window)
        hi = min(len(tokens), pos + proximity_window + 1)
        for j in range(lo, hi):
            relevant_idx.add(j)

    # (B) any token inside a clothing-marked sentence
    for si in clothing_sent_idx:
        sa, sb = sents[si]
        for j, (_, a, b) in enumerate(tokens):
            if a >= sa and b <= sb:
                relevant_idx.add(j)

    # Nothing relevant? bail early
    if not relevant_idx:
        return {"type": [], "color": [], "pattern": []}

    types, colors, patterns = set(), set(), set()

    # Scan only relevant tokens, skipping obvious background words
    for idx, (tok, _, _) in enumerate(tokens):
        if idx not in relevant_idx:
            continue
        if tok in NONCLOTHING_HINTS:
            continue

        # types
        t = normalize_type(tok)
        if t:
            # canonicalize tee/t-shirt etc.
            types.add(t)

        # single-token patterns
        p = normalize_pattern(tok)
        if p:
            if _adjacent_to(tokens, idx, NONCLOTHING_HINTS):   # e.g., "plaid blanket"
                continue
            patterns.add(p)

        # colors
        c = normalize_color(tok)
        if c:
            if _adjacent_to(tokens, idx, NONCLOTHING_HINTS):   # e.g., "brown hair", "blue blanket"
                continue
            colors.add(c)

    # multi-word patterns (adjacency-aware)
    rel_tokens = [tokens[i] for i in sorted(relevant_idx)]
    rel_words  = [t[0] for t in rel_tokens]

    def _phrase_present_not_adjacent_to_nonclothing(words: List[str], phrase: str) -> bool:
        parts = phrase.split()
        L = len(parts)
        for i in range(len(words) - L + 1):
            if words[i:i+L] == parts:
                left  = words[i-1] if i-1 >= 0 else None
                right = words[i+L] if i+L < len(words) else None
                if (left in NONCLOTHING_HINTS) or (right in NONCLOTHING_HINTS):
                    continue  # e.g., "plaid [blanket]"
                return True
        return False

    for pat_phrase in ["polka dot","animal print","checkered","striped","floral","camouflage","graphic","logo","plaid"]:
        if _phrase_present_not_adjacent_to_nonclothing(rel_words, pat_phrase):
            patterns.add(normalize_pattern(pat_phrase))

    # Heuristic: printed/text/letters near clothing ⇒ treat as logo/graphic
    if re.search(r"\b(logo|graphic|print|printed|text|letters)\b", " ".join(rel_words)):
        patterns.add("logo")
    
    # Post-process: collapse ambiguous types (e.g., "top" + "t-shirt" -> keep t-shirt)
    if "t-shirt" in types and "top" in types:
        types.discard("top")

    # Limit excessive colors (long captions can still mention many)
    if len(colors) > max_colors:
        colors = set(list(colors)[:max_colors])  # preserve arbitrary but bounded count

    return {
        "type":    sorted(types),
        "color":   sorted(colors),
        "pattern": sorted(p for p in patterns if p),
    }


In [7]:
# Section 6 - Florence-2 Inference Helpers (Caption → Tags) [FINAL PATCH]

@torch.inference_mode()
def florence_generate_caption(
    img: Image.Image,
    task_token: str = TASK_CAPTION,         
    max_new_tokens: int = MAX_NEW_TOKENS,
    num_beams: int = NUM_BEAMS
) -> str:
    # Prompt must be the task token only; processor will expand it.
    prompt = task_token

    # Bound compute
    img_resized = resize_long_side(img, RESIZE_LONG_SIDE)

    # >>> IMPORTANT: pass images as a list (batch of 1) <<<
    inputs = processor(
        text=prompt,
        images=[img_resized],               # <- list, not a single PIL image
        return_tensors="pt"
    )

    # Align device + dtype with model (floats -> model dtype; ints -> moved only)
    model_dtype = next(model.parameters()).dtype
    aligned = {}
    for k, v in inputs.items():
        if torch.is_tensor(v):
            aligned[k] = v.to(device=device, dtype=model_dtype) if torch.is_floating_point(v) else v.to(device)
        else:
            aligned[k] = v

    # Generate caption
    outputs = model.generate(
        **aligned,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
    )
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    caption = caption.replace("<CAPTION>", "").replace("<MORE_DETAILED_CAPTION>", "").strip()
    return caption

def image_to_tags_and_caption(img: Image.Image):
    caption = florence_generate_caption(img, TASK_CAPTION)
    tags = extract_tags_from_caption(caption)
    return tags, caption


In [8]:
print(type(processor))                 # should show Florence2Processor
print(next(model.parameters()).dtype)  # model dtype

<class 'transformers_modules.microsoft.Florence-2-base.5ca5edf5bd017b9919c05d08aebef5e4c7ac3bac.processing_florence2.Florence2Processor'>
torch.float32


In [9]:
# Section 7 - Pick Training Images, Run, and Display

sample_paths = discover_images(TRAIN_DIR, n=N_SAMPLES)
if PRINT_DEBUG:
    print("=== Sample Images ===")
    for i, p in enumerate(sample_paths, 1):
        print(f"{i}. {p}")

results: List[Dict[str, Any]] = []

for idx, p in enumerate(sample_paths, 1):
    img = load_image(p)
    tags, caption = image_to_tags_and_caption(img)

    # record
    rec = {"index": idx, "path": p, "caption": caption, "tags": tags}
    results.append(rec)

    # show
    plt.figure()
    plt.imshow(img)
    plt.axis("off")
    plt.title(f"Image {idx}")
    plt.show()

    print(f"\n--- Image {idx} ---")
    print(f"Path:    {p}")
    print(f"Caption: {caption}")
    print(f"Tags:    {json.dumps(tags, ensure_ascii=False)}")

# optionally save
with open(SAVE_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
if PRINT_DEBUG:
    print(f"\nSaved results to: {SAVE_JSON_PATH}")


FileNotFoundError: Could not find 15 images under F:\deepfashion2\images\train\train\image. Found 0.