In [2]:
"""
API with JSON Validation using Pydantic - Complete Solution
Validate JSON input using Pydantic before processing
"""

from pydantic import BaseModel, Field, field_validator
from typing import Optional
import json

print("="*50)
print("PYDANTIC BASICS")
print("="*50)

class SimpleProduct(BaseModel):
    """A simple product model for validation."""
    name: str
    price: float
    quantity: int = 1  # Default value
    
    @field_validator('price')
    def price_must_be_positive(cls, v):
        """Validate that price is positive."""
        if v <= 0:
            raise ValueError('Price must be positive')
        return v
    
    @field_validator('quantity')
    def quantity_must_be_positive(cls, v):
        """Validate that quantity is positive."""
        if v <= 0:
            raise ValueError('Quantity must be positive')
        return v

# Test validation
print("\n1. Valid data:")
try:
    product1 = SimpleProduct(name="Widget", price=10.99, quantity=5)
    print(f"  ✓ Valid: {product1.name} - ${product1.price}")
except Exception as e:
    print(f"  ✗ Error: {e}")

print("\n2. Invalid data (negative price):")
try:
    product2 = SimpleProduct(name="Widget", price=-10.99)
except Exception as e:
    print(f"  ✗ Validation error (expected): {e}")

print("\n✓ Pydantic basics working!")


PYDANTIC BASICS

1. Valid data:
  ✓ Valid: Widget - $10.99

2. Invalid data (negative price):
  ✗ Validation error (expected): 1 validation error for SimpleProduct
price
  Value error, Price must be positive [type=value_error, input_value=-10.99, input_type=float]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error

✓ Pydantic basics working!


In [None]:
import os
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Optional, List, Dict, Tuple

import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field, ValidationError, ConfigDict, field_validator


# =========================
# Setup
# =========================
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Set it in your environment before running."
    )

client = OpenAI(api_key=api_key)

OUTPUT_DIR = Path("generated_listings")
OUTPUT_DIR.mkdir(exist_ok=True)


# =========================
# Pydantic Models (match your dataset columns)
# =========================
class ProductInput(BaseModel):
    model_config = ConfigDict(extra="forbid")

    id: int
    gender: str = Field(..., min_length=1)
    masterCategory: str = Field(..., min_length=1)
    subCategory: str = Field(..., min_length=1)
    articleType: str = Field(..., min_length=1)
    baseColour: str = Field(..., min_length=1)
    season: str = Field(..., min_length=1)
    year: Optional[float] = None
    usage: str = Field(..., min_length=1)
    productDisplayName: str = Field(..., min_length=1)

    # We validate that image exists. You will supply image_base64 separately.
    image: Any

    @field_validator(
        "gender",
        "masterCategory",
        "subCategory",
        "articleType",
        "baseColour",
        "season",
        "usage",
        "productDisplayName",
    )
    @classmethod
    def strip_required_strings(cls, v: str) -> str:
        v = v.strip()
        if not v:
            raise ValueError("must not be empty")
        return v

    @field_validator("year")
    @classmethod
    def validate_year(cls, v: Optional[float]) -> Optional[float]:
        if v is None:
            return None
        if v < 1900 or v > 2100:
            raise ValueError("year must be between 1900 and 2100")
        return v


class ListingOutput(BaseModel):
    model_config = ConfigDict(extra="forbid")

    title: str = Field(..., min_length=5)
    description: str = Field(..., min_length=20)
    features: List[str] = Field(..., min_length=3, max_length=10)
    keywords: List[str] = Field(default_factory=list, max_length=20)

    @field_validator("title", "description")
    @classmethod
    def strip_text(cls, v: str) -> str:
        v = v.strip()
        if not v:
            raise ValueError("must not be empty")
        return v

    @field_validator("features", "keywords")
    @classmethod
    def ensure_list_of_strings(cls, v: List[Any]) -> List[str]:
        if not isinstance(v, list):
            raise ValueError("must be a list")
        out: List[str] = []
        for item in v:
            if not isinstance(item, str):
                raise ValueError("all items must be strings")
            s = item.strip()
            if s:
                out.append(s)
        return out


# =========================
# Helpers
# =========================
def safe_json_loads(text: str) -> dict:
    """
    Tries to parse JSON strictly. If the model adds extra text, tries to extract JSON block.
    """
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        candidate = text[start : end + 1]
        return json.loads(candidate)

    raise json.JSONDecodeError("No JSON object found", text, 0)


def product_from_row(row: pd.Series) -> ProductInput:
    """
    Validates a DataFrame row against the dataset-based ProductInput model.
    """
    payload = {
        "id": row.get("id"),
        "gender": row.get("gender"),
        "masterCategory": row.get("masterCategory"),
        "subCategory": row.get("subCategory"),
        "articleType": row.get("articleType"),
        "baseColour": row.get("baseColour"),
        "season": row.get("season"),
        "year": row.get("year"),
        "usage": row.get("usage"),
        "productDisplayName": row.get("productDisplayName"),
        "image": row.get("image"),
    }
    return ProductInput.model_validate(payload)


def image_to_base64(image_value: Any) -> str:
    """
    Converts the 'image' column content into a base64 string.

    Supports:
    - PIL.Image.Image
    - numpy arrays (H, W, C)
    - bytes / bytearray
    - file paths (str or Path)
    - already-base64 strings
    """
    import base64
    from io import BytesIO

    if image_value is None:
        raise ValueError("image is missing")

    # 1) PIL Image
    try:
        from PIL import Image  # type: ignore
        if isinstance(image_value, Image.Image):
            buf = BytesIO()
            # Save as JPEG to keep it compact; change to PNG if you need lossless
            image_value.save(buf, format="JPEG")
            return base64.b64encode(buf.getvalue()).decode("utf-8")
    except Exception:
        # If PIL isn't available or something unexpected happens, continue
        pass

    # 2) numpy array
    try:
        import numpy as np  # type: ignore
        if isinstance(image_value, np.ndarray):
            try:
                from PIL import Image  # type: ignore
            except Exception as e:
                raise ValueError("Got numpy image but PIL is not installed") from e

            arr = image_value
            if arr.ndim == 2:
                mode = "L"
            elif arr.ndim == 3 and arr.shape[2] == 3:
                mode = "RGB"
            elif arr.ndim == 3 and arr.shape[2] == 4:
                mode = "RGBA"
            else:
                raise ValueError(f"Unsupported numpy image shape: {arr.shape}")

            img = Image.fromarray(arr.astype("uint8"), mode=mode)
            buf = BytesIO()
            img.save(buf, format="JPEG")
            return base64.b64encode(buf.getvalue()).decode("utf-8")
    except Exception:
        pass

    # 3) bytes
    if isinstance(image_value, (bytes, bytearray)):
        return base64.b64encode(image_value).decode("utf-8")

    # 4) str or Path
    if isinstance(image_value, (str, Path)):
        s = str(image_value).strip()
        if not s:
            raise ValueError("image path/string is empty")

        # Heuristic: treat as base64 if it looks like base64
        if len(s) > 200 and all(c.isalnum() or c in "+/=\n\r" for c in s[:200]):
            return s.replace("\n", "").replace("\r", "")

        p = Path(s)
        if not p.exists() or not p.is_file():
            raise ValueError(f"image path does not exist: {p}")

        content = p.read_bytes()
        return base64.b64encode(content).decode("utf-8")

    raise ValueError(f"Unsupported image type: {type(image_value)}")



def create_product_listing_prompt(product: ProductInput) -> str:
    """
    Prompt for fashion product listing generation based on your dataset fields.
    """
    year_text = str(int(product.year)) if product.year is not None else "N/A"

    return f"""
You are an expert ecommerce copywriter for fashion products.
Return valid JSON only. No markdown, no extra text.

Product data:
- id: {product.id}
- name: {product.productDisplayName}
- gender: {product.gender}
- masterCategory: {product.masterCategory}
- subCategory: {product.subCategory}
- articleType: {product.articleType}
- baseColour: {product.baseColour}
- season: {product.season}
- year: {year_text}
- usage: {product.usage}

Output JSON schema:
{{
  "title": "string",
  "description": "string",
  "features": ["string", "string", "string"],
  "keywords": ["string"]
}}

Rules:
- Output must be valid JSON
- Title should include key product attributes (type, color, usage)
- Description should be 2 to 4 sentences
- Features must be 3 to 10 bullet-style strings
- Keywords should include search-friendly terms (type, color, season, gender, usage)
""".strip()


def call_openai_for_listing(prompt: str, image_base64: str, model: str = "gpt-4.1-mini") -> dict:
    """
    Sends image + prompt, returns validated JSON dict.
    """
    if not image_base64 or not isinstance(image_base64, str):
        raise ValueError("Missing or invalid image_base64")

    data_url = f"data:image/jpeg;base64,{image_base64}"

    response = client.responses.create(
        model=model,
        input=[
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": prompt},
                    {"type": "input_image", "image_url": data_url},
                ],
            }
        ],
        temperature=0.7,
    )

    raw_text = response.output_text
    if not raw_text:
        raise RuntimeError("Empty response.output_text")

    parsed = safe_json_loads(raw_text)

    listing = ListingOutput.model_validate(parsed)
    return listing.model_dump()


def generate_listings_batch(
    products_df: pd.DataFrame,
    n_products: int = 10,
    sleep_seconds: float = 0.5,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Validates incoming product rows, generates listings, validates outputs, saves results.
    """
    results: List[Dict[str, Any]] = []
    errors: List[Dict[str, Any]] = []

    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    df_slice = products_df.head(n_products) if n_products else products_df

    for idx, row in df_slice.iterrows():
        fallback_name = row.get("productDisplayName") or f"product_{idx}"

        try:
            # 1) Validate row (input validation)
            product = product_from_row(row)

            # 2) Convert image to base64
            image_b64 = image_to_base64(product.image)

            # 3) Create prompt
            prompt = create_product_listing_prompt(product)

            # 4) Call API + output validation
            listing = call_openai_for_listing(prompt=prompt, image_base64=image_b64)

            results.append(
                {
                    "index": idx,
                    "id": product.id,
                    "productDisplayName": product.productDisplayName,
                    "gender": product.gender,
                    "masterCategory": product.masterCategory,
                    "subCategory": product.subCategory,
                    "articleType": product.articleType,
                    "baseColour": product.baseColour,
                    "season": product.season,
                    "year": product.year,
                    "usage": product.usage,
                    "title": listing["title"],
                    "description": listing["description"],
                    "features": listing["features"],
                    "keywords": listing["keywords"],
                }
            )

            print(f"✓ [{idx}] Generated listing for: {product.productDisplayName}")

        except ValidationError as e:
            errors.append(
                {
                    "index": idx,
                    "id": row.get("id"),
                    "productDisplayName": fallback_name,
                    "error_type": "validation_error",
                    "error": json.dumps(e.errors(), ensure_ascii=False),
                }
            )
            print(f"⚠ [{idx}] Validation failed for {fallback_name}: {e.errors()}")

        except Exception as e:
            errors.append(
                {
                    "index": idx,
                    "id": row.get("id"),
                    "productDisplayName": fallback_name,
                    "error_type": "runtime_error",
                    "error": str(e),
                }
            )
            print(f"⚠ [{idx}] Failed for {fallback_name}: {e}")

        time.sleep(sleep_seconds)

    results_df = pd.DataFrame(results)
    errors_df = pd.DataFrame(errors)

    results_path_jsonl = OUTPUT_DIR / f"listings_{run_id}.jsonl"
    results_path_csv = OUTPUT_DIR / f"listings_{run_id}.csv"
    errors_path_csv = OUTPUT_DIR / f"errors_{run_id}.csv"

    with results_path_jsonl.open("w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    if not results_df.empty:
        results_df.to_csv(results_path_csv, index=False)

    if not errors_df.empty:
        errors_df.to_csv(errors_path_csv, index=False)

    print("\nBatch complete!")
    print(f"Saved listings JSONL: {results_path_jsonl}")
    if not results_df.empty:
        print(f"Saved listings CSV:   {results_path_csv}")
    if not errors_df.empty:
        print(f"Saved errors CSV:     {errors_path_csv}")

    return results_df, errors_df


# =========================
# Run 
# =========================

from datasets import load_dataset

# Load dataset from HuggingFace
print("Loading product dataset...")
try:
    # Try loading the dataset
    dataset = load_dataset("ashraq/fashion-product-images-small", split="train[:100]")  # First 100 samples
    print(f"✓ Loaded {len(dataset)} products")
    
    # Convert to pandas for easier manipulation
    products_df = pd.DataFrame(dataset)
    print(f"Dataset columns: {products_df.columns.tolist()}")
    
except Exception as e:
    print(f"⚠ Could not load HuggingFace dataset: {e}")
    print("Using local images instead...")


results_df, errors_df = generate_listings_batch(products_df, n_products=10, sleep_seconds=0.5)

print("\nResults preview:")
print(results_df.head(3))

print("\nErrors preview:")
print(errors_df.head(3))


Loading product dataset...
✓ Loaded 100 products
Dataset columns: ['id', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'year', 'usage', 'productDisplayName', 'image']
⚠ [0] Failed for Turtle Check Men Navy Blue Shirt: Unsupported image type: <class 'PIL.JpegImagePlugin.JpegImageFile'>
⚠ [1] Failed for Peter England Men Party Blue Jeans: Unsupported image type: <class 'PIL.JpegImagePlugin.JpegImageFile'>
⚠ [2] Failed for Titan Women Silver Watch: Unsupported image type: <class 'PIL.Image.Image'>
⚠ [3] Failed for Manchester United Men Solid Black Track Pants: Unsupported image type: <class 'PIL.JpegImagePlugin.JpegImageFile'>
⚠ [4] Failed for Puma Men Grey T-shirt: Unsupported image type: <class 'PIL.Image.Image'>
⚠ [5] Failed for Inkfruit Mens Chain Reaction T-shirt: Unsupported image type: <class 'PIL.JpegImagePlugin.JpegImageFile'>
⚠ [6] Failed for Fabindia Men Striped Green Shirt: Unsupported image type: <class 'PIL.JpegImagePlugin.JpegImageFile'>
