In [1]:
# --- Imports ---
import datetime as dt
import math
import os
import re
import sys
import time
import warnings
import json
import functools
from pathlib import Path

# Data Manipulation
import numpy as np
import pandas as pd

# Validation
from pydantic import BaseModel, Field, ValidationError

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Latex, Math

# Console Formatting
from colorama import Fore, Style

# Logging
from loguru import logger
import uuid

# Secrets Management
sys.path.append("/mnt/git/github/gabemcwilliams/common-components/security")
from vault_mgr import *

# API & Scraping Utilities
import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urlparse, urlencode
import certifi
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO


In [2]:
# Image storage directory
images_dir = '/mnt/mls/images'


In [3]:
vault = VaultManager(debug=True)
secrets = vault.read_secret(mount_point='api', path='pexels')




VAULT_ADDR         : [CONFIGURED]
VAULT_CLIENT_CERT  : [CONFIGURED]
VAULT_CLIENT_KEY   : [CONFIGURED]
VAULT_CACERT       : [CONFIGURED]
SSL_CERT_FILE      : [CONFIGURED]


Cert Preview: public.crt found
Key Preview: private.key found
 * [93mVault Client[0m is [32m[AUTHENTICATED][0m


In [4]:
def api_call(query_label: str | None = None, page_no: str | None = None, per_page: int = 80) -> dict:
    """
    Calls the Pexels API to retrieve image search results.

    Args:
        query_label (str | None): Search keyword (e.g., "plant").
        page_no (str | None): Page number of results to retrieve.
        per_page (int): Number of images per page (default is 80).

    Returns:
        dict: Parsed JSON response from the Pexels API containing metadata and image URLs.
    """

    headers = {'Authorization': secrets['api_key']}
    params = {"query": query_label, "per_page": per_page, "page": page_no}
    request_url = f'{secrets["base_uri"]}/v1/search'
    print(request_url)
    response = requests.get(request_url, verify=certifi.where(), headers=headers, params=params)
    return response.json()


In [6]:
from pathlib import Path

query_list = [
    "leaf veins macro",
    "succulent cluster",
    "tropical foliage pattern",
    "jungle canopy plant",
    "young sprout macro"
]

output_dir = Path("/mnt/mls/data/pexels_metadata")
output_dir.mkdir(parents=True, exist_ok=True)

for query_label in query_list:
    print(f"\n--- Starting query: '{query_label}' ---\n")
    page = 0
    max_pages = 80
    all_pages = []

    parquet_path = output_dir / f"{query_label.replace(' ', '_')}.parquet"

    while page < max_pages:
        page += 1
        try:
            c_dict = api_call(query_label=query_label, page_no=page)
            print(f"  Fetched Page {c_dict['page']} of {query_label}")
            df_current = pd.DataFrame(c_dict['photos'])

            if df_current.empty:
                print("  Empty page — stopping early.")
                break

            df_current['query'] = query_label
            df_current['page'] = page

            all_pages.append(df_current)

            # Save progress
            pd.concat(all_pages, ignore_index=True).to_parquet(parquet_path, index=False)
            print(f"  Saved page {page} to {parquet_path}")

        except Exception as e:
            print(f"  [ERROR] Failed on page {page} of '{query_label}': {e}")
            break

    print(f"--- Finished '{query_label}' with {page} pages ---\n")



--- Starting query: 'leaf veins macro' ---

https://api.pexels.com/v1/search
  Fetched Page 1 of leaf veins macro
  Saved page 1 to /mnt/mls/data/pexels_metadata/leaf_veins_macro.parquet
https://api.pexels.com/v1/search
  Fetched Page 2 of leaf veins macro
  Saved page 2 to /mnt/mls/data/pexels_metadata/leaf_veins_macro.parquet
https://api.pexels.com/v1/search
  Fetched Page 3 of leaf veins macro
  Saved page 3 to /mnt/mls/data/pexels_metadata/leaf_veins_macro.parquet
https://api.pexels.com/v1/search
  Fetched Page 4 of leaf veins macro
  Saved page 4 to /mnt/mls/data/pexels_metadata/leaf_veins_macro.parquet
https://api.pexels.com/v1/search
  Fetched Page 5 of leaf veins macro
  Saved page 5 to /mnt/mls/data/pexels_metadata/leaf_veins_macro.parquet
https://api.pexels.com/v1/search
  Fetched Page 6 of leaf veins macro
  Saved page 6 to /mnt/mls/data/pexels_metadata/leaf_veins_macro.parquet
https://api.pexels.com/v1/search
  Fetched Page 7 of leaf veins macro
  Saved page 7 to /mnt/mls/

In [7]:

parquet_dir = Path("/mnt/mls/data/pexels_metadata")
parquet_files = list(parquet_dir.glob("*.parquet"))

df_master = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
print(f"Total combined records: {len(df_master)}")

Total combined records: 16799


In [8]:
df_master['original'] = df_master['src'].apply(lambda x: x['original'])


In [9]:
bad_phrases = set()

# === 1. Indoor Decor & Furniture Phrases ===
bad_phrases.update({
    "light bulb",
    "coffee table",
    "wall art",
    "interior design",
    "dining room",
    "living room",
    "bedroom",
    "home office",
    "window sill", "bathroom counter",
    "sofa chair",
    "table lamp",
    "picture frame",
    "picture wall",
    "flat lay"
})

# === 2. Garden, Patio, & Staging Items ===
bad_phrases.update({
    "potted plant",
    "hanging basket",
    "urban garden",
    "flower vase",
    "fruit basket",
    "wooden deck",
    "picnic table",
    "lawn ornament",
    "garden gnome"
})

# === 3. Miscellaneous Environmental Distractions ===
bad_phrases.update({
    "city park",
    "smart phone",
    "art installation",
    "power plant",
    "test tube",
    "tree bark"
})


In [10]:
bad_lemmas = set()

# === 1. People, Body Parts, Relationships, Gender ===
bad_lemmas.update({
    "man", "woman", "male", "female", "person", "client",
    "employee", "group", "volunteer", "gardener", "students",
    "child", "kid",
    "girl", "boy", "son", "daughter", "mom", "dad", "mother", "father",
    "bride", "groom", "couple", "romance", "intimate",
    "hand", "foot", "leg", "arm", "chest", "head", "eye", "nose", "ear", "mouth", "face",
    "young", "old", "pose", "photographer", "hombre", "nutritionist", "florist", "arrange",
    "businessman", "businesswoman"
})

# === 2. Animals & Insects ===
bad_lemmas.update({
    "cat", "dog", "butterfly", "bee", "caterpillar", "squirrel", "bird",
    "perch", "spider", "web", "insect", "fly", "furry", "snail", "slug",
    "honeybee", "beetle", "tick", "frog", "lynx", "chameleon", "bumblebee",
    "hummingbird", "hovers", "ladybug", "moth", "peacock", "snake", "cobra", "python"
                                                                             "playful", "obedient"
})

# === 3. Furniture, Decor, Indoor Items ===
bad_lemmas.update({
    "table", "bed", "couch", "chair", "pillow", "blanket", "mirror", "curtain", "carpet",
    "rug", "lamp", "ceiling", "floor", "tile", "window", "shelf", "frame", "decor",
    "design", "wall", "painting", "print", "art", "style", "bottle", "vase", "fashion",
    "portrait", "studio", "bedding", "journal", "library", "bookshelves", "card", "lipstick",
    "blazer", "supplements", "interior", "stock", "photo", "free", "sunglasses", "drink",
    "flatlay", "paper", "fabric", "candle", "arrangement", "pitcher", "potted", "bible",
    "hanging", "hang", "reading", "nook", "bouquet", "cabinet", "drawer"
})

# === 4. Office, Tech, Work Items ===
bad_lemmas.update({
    "book", "notebook", "pen", "pencil", "laptop", "smartphone", "keyboard",
    "computer", "desk", "office", "setup", "device", "tablet", "work", "room", "scissors",
    "business", "magazine", "finance", "money", "bill", "dollar", "teamwork", "team",
    "suit", "sneakers", "shoes", "network", "cable",
    "nutritious", "pants", "dress", "wireless", "headphone", "jacket"
})

# === 5. Containers, Kitchen, and Medical Props ===
bad_lemmas.update({
    "basket", "bucket", "pot", "planter", "terrarium", "mug", "crate", "box",
    "cup", "spoon", "fork", "plate", "dropper", "vial", "tool", "mortar", "medicine",
    "bowl", "board", "document", "sink", "can", "tube", "cocktail", "refresh",
    "jar", "slice", "teapot", "herb", "cell", "platter", "pan",
    "remedies", "ingredient"
})

# === 6. Abstract, Model, or Visual Noise ===
bad_lemmas.update({
    "closeup", "shadow", "reflection", "model", "figure", "stylish",
    "sculpture", "statue", "diversity", "mockup", "artistic", "composition", "word", "letter", "aerial",
    "silhouette", "microscopic",
})

# === 7. Food, Fruit, Culinary ===
bad_lemmas.update({
    "fruit", "produce", "food", "kitchen", "grape", "cherry", "cherries",
    "tomato", "nut", "root", "culinary", "wine", "strawberry", "ripe", "harvest", "mushroom", "fungus", "fungi", "hop",
    "pineapple", "berry", "pumpkin", "coconut", "hazelnut", "walnut", "penut", "artichoke", "juice", "juicy", "fresh",
    "sliced", "meat", "skewered", "grill", "egg", "bacon", "pasta"
})

# === 8. Landscape, Outdoors, Urban ===
bad_lemmas.update({
    "farm", "pasture", "agriculture", "farming", "tractor", "trailer",
    "combine", "barn", "gravel", "path", "bench", "fence", "porch",
    "balcony", "building", "road", "driveway", "street", "bridge", "valley",
    "skyline", "resort", "pool", "travel", "leisure", "walkway", "roof", "environment",
    "countryside", "country", "urban", "exterior", "home",
    "house", "scene", "pathway", "wooden", "wood", "sign", "footpath", "forest", "ocean", "mountains", "desert",
    "rustic", "river", "lake", "waterfall", "fountain", "pond", "logs", "firewood", "sea"
})

# === 9. Religious, Monumental, Public Architecture ===
bad_lemmas.update({
    "temple", "church", "mosque", "graveyard", "tombstone", "skeleton", "store", "storefront"
})

# === 10. Vehicles, Transportation, Infrastructure ===
bad_lemmas.update({
    "vehicle", "truck", "bus", "bicycle", "bike", "pipe", "plumbing", "car", "speedometer"
})

# === 11. Industrial & Work-Related ===
bad_lemmas.update({
    "industrial", "steel", "structure", "laboratory", "tin", "copper", "column",
    "concrete", "wire", "barbed", "rust", "disc", "rock", "chunk", "illuminate",
    "LED", "light", "museum", "architecture", "production", "product", "stone", "garage", "door", "cottage", "college",
    "campus", "townhouse", "homeopathy"
})

# === 12. Miscellaneous Tools & Actions ===
bad_lemmas.update({
    "shovel", "hold", "dna", "helix"
})



In [11]:
from collections import Counter
import spacy

nlp = spacy.load("en_core_web_sm")
word_counts = Counter()


def is_keep(token):
    return token.is_alpha and not token.is_stop and not token.is_punct


for alt in df_master['alt']:
    doc = nlp(alt)
    for token in doc:
        if is_keep(token):
            word_counts[token.lemma_.lower()] += 1




In [12]:
def should_reject(text: str, bad_lemmas: set, bad_phrases: set) -> dict:
    """
    Evaluates whether a given alt-text string should be rejected based on exclusion rules.

    The function checks for:
    - Non-ASCII characters
    - Empty or whitespace-only text
    - Presence of undesirable phrases (substring match)
    - Presence of undesirable lemmas or tokens (via spaCy NLP)

    Args:
        text (str): The alt-text string to evaluate.
        bad_lemmas (set): A set of lowercased lemmatized words to reject.
        bad_phrases (set): A set of exact phrases that trigger rejection.

    Returns:
        dict: A dictionary with:
            - "valid" (bool): True if the text passes all checks.
            - "reasons" (list | None): A list of reasons for rejection, or None if valid.
    """

    reasons = []

    if not all(ord(c) < 128 for c in text):
        reasons.append("non-ascii characters")

    if not text or text.strip() == "":
        reasons.append("blank text")

    text_lower = text.lower()
    for phrase in bad_phrases:
        if phrase in text_lower:
            reasons.append(f"bad phrase: '{phrase}'")

    doc = nlp(text)
    for token in doc:
        lemma = token.lemma_.lower()
        raw = token.text.lower().replace("-", "")
        if lemma in bad_lemmas:
            reasons.append(f"bad lemma: '{lemma}'")
        if raw in bad_lemmas:
            reasons.append(f"bad token: '{raw}'")

    return {"valid": len(reasons) == 0, "reasons": reasons if reasons else None}


In [13]:
def save_image_by_size(
        url: str,
        base_dir: str | None = None,
        label: str | None = None,
        alt_text: str = '',
        reasons=None
) -> None:

    """
    Downloads an image from a URL, saves it in a structured directory based on resolution,
    and writes associated metadata to a JSON file.

    Folder structure:
        [base_dir]/clean/[label]/[width]/[height]/[filename]

    Args:
        url (str): The URL of the image to download.
        base_dir (str | None): Base directory where images and metadata will be stored.
        label (str | None): Optional category or query label for organizing images.
        alt_text (str): Alt-text or caption associated with the image.
        reasons (list | None): Optional reasons for acceptance or context metadata.
                               If None, the image is considered "valid".

    Returns:
        None
    """

    filename = urlparse(url).path.split("/")[-1]
    response = requests.get(url, verify=certifi.where())
    image = Image.open(BytesIO(response.content)).convert("RGB")
    width, height = image.size

    folder_path = f'{base_dir}/clean/{label}/{width}/{height}'
    os.makedirs(folder_path, exist_ok=True)
    image.save(f'{folder_path}/{filename}')

    metadata = {
        "label": label,
        "alt_text": alt_text,
        "reasons": reasons if reasons else "valid",
        "url": url,
        "dimensions": {"width": width, "height": height},
        "filename": filename
    }

    filename_root = filename.split('.')[0]
    with open(f'{folder_path}/{filename_root}.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"Saved to: {folder_path}/{filename}")

In [14]:
def save_image_to_rejects(url: str, base_dir: str | None = None, label: str | None = None,
                          alt_text: str = '', reasons: list | None = None) -> None:
    """
    Downloads an image from a URL and saves it to a 'rejected' folder with accompanying metadata.

    Folder structure:
        [base_dir]/rejected/[label]/[filename]

    Args:
        url (str): The URL of the image to download.
        base_dir (str | None): Base directory where rejected images and metadata will be stored.
        label (str | None): Optional label or query term used to organize rejected images.
        alt_text (str): Alt-text or caption associated with the image.
        reasons (list | None): List of rejection reasons. If None, defaults to ["unspecified"].

    Returns:
        None
    """

    filename = urlparse(url).path.split("/")[-1]
    response = requests.get(url, verify=certifi.where())
    image = Image.open(BytesIO(response.content)).convert("RGB")

    folder_path = f'{base_dir}/rejected/{label}'
    os.makedirs(folder_path, exist_ok=True)
    image.save(f'{folder_path}/{filename}')

    filename_root = filename.split('.')[0]
    metadata = {
        "label": label,
        "alt_text": alt_text,
        "reasons": reasons if reasons else ["unspecified"],
        "url": url,
    }

    with open(f'{folder_path}/{filename_root}.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"[REJECTED] Saved to: {folder_path}/{filename}")


In [15]:
clean_count = 0
rejected_count = 0

for index, row in df_master.iterrows():
    result = should_reject(text=row['alt'], bad_lemmas=bad_lemmas, bad_phrases=bad_phrases)

    print(result)

    if not result['valid']:
        rejected_count += 1
        save_image_to_rejects(
            url=row['original'],
            base_dir=images_dir,
            label=query_label,
            alt_text=row['alt'],
            reasons=result['reasons']
        )

    else:
        clean_count += 1
        save_image_by_size(
            url=row['original'],
            base_dir=images_dir,
            label=query_label,
            alt_text=row['alt'],
        )

print(Fore.GREEN + f"\nClean images: {clean_count}")
print(Fore.RED + f"Rejected images: {rejected_count}")
print(Fore.YELLOW + f"Acceptance rate: {clean_count / (clean_count + rejected_count):.2%}\n" + Style.RESET_ALL)


{'valid': False, 'reasons': ["bad lemma: 'berry'", "bad token: 'berry'"]}
[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-31979873.jpeg
{'valid': True, 'reasons': None}
Saved to: /mnt/mls/images/clean/young sprout macro/3456/5184/pexels-photo-31964236.jpeg
{'valid': False, 'reasons': ['blank text']}
[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-31968576.jpeg
{'valid': True, 'reasons': None}
Saved to: /mnt/mls/images/clean/young sprout macro/3120/4160/pexels-photo-1226302.jpeg
{'valid': True, 'reasons': None}
Saved to: /mnt/mls/images/clean/young sprout macro/4019/6029/pexels-photo-4594030.jpeg
{'valid': False, 'reasons': ["bad lemma: 'artistic'", "bad token: 'artistic'", "bad lemma: 'shadow'"]}
[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-1204941.jpeg
{'valid': False, 'reasons': ["bad lemma: 'coconut'", "bad token: 'coconut'", "bad lemma: 'shadow'"]}
[REJECTED] Saved to: /mnt/mls/images/rejec



[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-15941642.png
{'valid': False, 'reasons': ["bad lemma: 'fresh'", "bad token: 'fresh'", "bad lemma: 'grape'"]}
[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-9968909.jpeg
{'valid': False, 'reasons': ["bad lemma: 'fresh'", "bad token: 'fresh'", "bad lemma: 'fruit'", "bad lemma: 'food'", "bad token: 'food'"]}
[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-6870819.jpeg
{'valid': True, 'reasons': None}
Saved to: /mnt/mls/images/clean/young sprout macro/5829/3886/pexels-photo-9536048.jpeg
{'valid': True, 'reasons': None}
Saved to: /mnt/mls/images/clean/young sprout macro/5568/3712/pexels-photo-16886877.jpeg
{'valid': False, 'reasons': ["bad lemma: 'bowl'", "bad token: 'bowl'", "bad lemma: 'food'", "bad token: 'food'"]}
[REJECTED] Saved to: /mnt/mls/images/rejected/young sprout macro/pexels-photo-5234121.jpeg
{'valid': False, 'reasons': ["bad lemma: 'ripe'

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))