# In [None]:
import json
from enum import Enum
from typing import Tuple, List, Dict
from pydantic import BaseModel
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# ——— Configuration ———
# Shared OpenAI client used by classify_listing. The key is looked up in the
# OPENAI_API_KEY environment variable; the lookup yields None when it is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# ——— 1) System prompt ———
# System message sent with every classification request (see classify_listing).
# It spells out the three conditions that must ALL hold for a violation, the
# conditions that each make a listing out of scope, and tells the model to
# prefer flagging when uncertain.
# NOTE: the label "etsy.childrens_drawstrings" quoted in the prompt must stay in
# sync with Decision.CHILDRENS, otherwise response validation will reject it.
POLICY_SYSTEM_PROMPT = (
    """You are a compliance analyst for an e-commerce marketplace. Your job is to
classify whether a product listing violates the **Children’s Drawstrings** policy.

If there is any evidence (in keywords, title, description, or images) that the item is available in youth, toddler, or children's sizes (e.g. "youth", "kids", "child", "mommy & me", "matching sets", etc.), you should assume it is intended for children and classify accordingly.
Drawstrings may appear faint, match fabric color, or be partly hidden — still flag as violating if even a decorative cord is seen.

**A listing *violates* if ALL of the following are true:**
  1. The item is an *upper-body* garment (hoodie, sweater, jacket, raincoat,
     cape, poncho, onesie, jumpsuit, **snowsuit**, **ski suit**, etc.).
  2. It contains a functional or decorative *drawstring or cord* located
     anywhere on the upper body (neck, hood, waist, hemline, etc.).
  3. The garment is intended for children age **14 or younger** (or sizes
     newborn through youth 14, e.g. 0–24 months, 2T, 3/6, youth 10, youth 14).

**A listing is *out_of_scope* if ANY of these is true:**
  • It is a bottom-only garment (pants, shorts, skirt) even if it has a drawstring.
  • It is clearly sized for teens/adults only (adult XS–2XL, etc.).
  • It is a non-clothing item (bags, hats, gift sets, toys, etc.).
  • It is baby-wearing outerwear designed for *adults* (maternity/kangaroo hoodies).

You have both the listing text and the images. Analyze the copy and the visuals carefully.

**Note:** When uncertain, it is better to err on the side of safety and flag the item as "etsy.childrens_drawstrings". A false positive (flagging adult-only incorrectly) is more acceptable than a false negative (missing a child product with drawstrings)."""
)


# ——— 2) User prompt ———
# Per-listing user message, filled in with str.format() by classify_listing.
# Placeholders:
#   {listing_json} – the listing dict serialised as indented JSON
#   {image_urls}   – a "- url" bullet list of image URLs, or "(no images)"
# The requested keys mirror the fields of ClassificationResponse, which is used
# to validate the model's reply. Because the template goes through str.format(),
# any literal brace added here would have to be doubled ("{{" / "}}").
USER_INSTRUCTIONS_TEMPLATE = """
Respond with JSON exactly, containing:
- decision: "etsy.childrens_drawstrings" or "out_of_scope"
- image_inspected: true if you actually reviewed the images
- images_reviewed: list of the URLs you inspected
- image_explanations: map each URL → one-sentence note of what you saw
- rationale: overall summary of which images drove your decision

Now classify this listing:

Listing JSON:
{listing_json}

Images:
{image_urls}
"""

# ——— 3) Pydantic schema ———
class Decision(str, Enum):
    # The two labels the classifier is allowed to return. Mixing in `str`
    # makes each member compare equal to (and serialise as) its string value.

    # Listing is judged to violate the children's drawstrings policy.
    CHILDRENS = "etsy.childrens_drawstrings"
    # Listing falls outside the policy.
    OUT_OF_SCOPE = "out_of_scope"

class ClassificationResponse(BaseModel):
    # Schema used to validate the model's JSON reply in classify_listing.
    # Field names match the keys requested in USER_INSTRUCTIONS_TEMPLATE.

    # Policy verdict; must be one of the Decision values.
    decision: Decision
    # Whether the model reports having actually looked at the images.
    image_inspected: bool
    # URLs the model reports having inspected.
    images_reviewed: List[str]
    # Image URL -> short note on what the model saw in that image.
    image_explanations: Dict[str, str]
    # Overall summary of what drove the decision.
    rationale: str

# ——— 4) Helper to filter fields ———
def filter_listing_fields(listing: dict) -> dict:
    """Return a new dict holding only the listing fields sent to the model.

    Keys outside the whitelist are dropped. The surviving keys keep the order
    they had in ``listing``, and ``listing`` itself is not modified.
    """
    whitelist = {"id", "title", "description", "category", "images", "keywords"}
    kept = {}
    for field, value in listing.items():
        if field in whitelist:
            kept[field] = value
    return kept

# ——— 5) classify_listing ———
def classify_listing(listing: dict) -> ClassificationResponse:
    """Ask the model whether ``listing`` violates the drawstrings policy.

    The listing is serialised to JSON and embedded in the user prompt together
    with a bullet list of its image URLs. When the listing has images, each URL
    is also attached as an ``image_url`` content part so the model can inspect
    the picture itself; otherwise the user message is plain text.

    Args:
        listing: Listing dict. Only the ``"images"`` key is read directly
            (expected to be a list of URL strings); the whole dict is
            serialised into the prompt.

    Returns:
        The model's reply parsed and validated as a ``ClassificationResponse``.

    Raises:
        Whatever the OpenAI client raises for a failed request, and whatever
        ``ClassificationResponse.model_validate_json`` raises when the reply
        does not match the schema.
    """
    listing_json = json.dumps(listing, ensure_ascii=False, indent=2)

    # `.get("images", [])` only covers a *missing* key; a listing that carries
    # `"images": null` would hand None to the join below and raise TypeError.
    # Treat missing, null and empty uniformly as "no images".
    urls = listing.get("images") or []
    md_urls = "\n".join(f"- {u}" for u in urls) or "(no images)"

    user_prompt = USER_INSTRUCTIONS_TEMPLATE.format(
        listing_json=listing_json,
        image_urls=md_urls
    )

    messages = [
        {"role": "system", "content": POLICY_SYSTEM_PROMPT}
    ]

    if urls:
        # Multi-part user message: the text prompt followed by one image part
        # per URL.
        content = [{"type": "text", "text": user_prompt}]
        for url in urls:
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": url,
                    "detail": "auto"
                }
            })
        messages.append({"role": "user", "content": content})
    else:
        messages.append({"role": "user", "content": user_prompt})

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=messages,
        # JSON mode: the reply body is a single JSON object, validated below.
        response_format={"type": "json_object"}
    )
    return ClassificationResponse.model_validate_json(response.choices[0].message.content)

# ——— 6) classify_entry ———
def classify_entry(entry: dict) -> Tuple[str, str, str]:
    """Classify one labelled dataset entry.

    Reads the listing from ``entry["reviewInput"]`` (reduced to the whitelisted
    fields) and the ground-truth label from ``entry["expectedOutcome"]``, then
    runs the classifier on the listing.

    Returns:
        ``(listing id, predicted label, expected label)``.
    """
    listing = filter_listing_fields(entry["reviewInput"])
    expected_label = entry["expectedOutcome"]
    verdict = classify_listing(listing)
    return (listing["id"], verdict.decision.value, expected_label)

# ——— 7) Evaluation ———
if __name__ == "__main__":
    def evaluate(dataset_path: str, max_workers: int = 8):
        """Classify every entry of a labelled dataset and print precision/recall.

        The dataset is a JSON file whose top-level ``"data"`` key holds the list
        of entries passed to ``classify_entry``. Entries are classified
        concurrently; one JSON line per classified entry is printed to stdout,
        followed by a final JSON line with the precision and recall of the
        "violating" label.

        An entry whose classification raises (API error, invalid reply, missing
        key, ...) is reported on stderr and excluded from the metrics, so one
        bad entry no longer discards the results of the whole run.

        Args:
            dataset_path: Path to the labelled dataset JSON file.
            max_workers: Number of worker threads issuing requests in parallel.
        """
        import sys  # local import: only needed for stderr diagnostics

        with open(dataset_path, encoding="utf-8") as f:
            data = json.load(f)["data"]

        positive = Decision.CHILDRENS.value
        results = []
        failures = 0
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            # Map each future to its position in the dataset so a failure can
            # be traced back to the entry that caused it.
            futures = {
                pool.submit(classify_entry, e): i for i, e in enumerate(data)
            }
            for future in as_completed(futures):
                try:
                    _id, pred, exp = future.result()
                except Exception as err:
                    # Top-level boundary: record the failure and keep going
                    # rather than losing every other (already paid-for) result.
                    failures += 1
                    print(
                        json.dumps({
                            "index": futures[future],
                            "error": f"{type(err).__name__}: {err}",
                        }),
                        file=sys.stderr,
                    )
                    continue
                print(json.dumps({"id": _id, "pred": pred, "exp": exp}))
                results.append((pred, exp))

        tp = sum(1 for p, e in results if p == e == positive)
        fp = sum(1 for p, e in results if p == positive and e != positive)
        fn = sum(1 for p, e in results if e == positive and p != positive)

        # Guard against empty denominators (no positive predictions / labels).
        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall = tp / (tp + fn) if tp + fn > 0 else 0.0

        print(json.dumps({"precision": round(precision, 4), "recall": round(recall, 4)}))

        if failures:
            # Make it obvious that the metrics cover only part of the dataset.
            print(
                json.dumps({"failed": failures, "evaluated": len(results)}),
                file=sys.stderr,
            )

    evaluate("drawstrings_labeled_data.json")
