In [None]:
# Try using text prompts to find cigarettes / wine bottles / beer cans, etc. Great for a hackathon demo and for bootstrapping labels when I don't have data yet.
# But it turned out not to be very accurate => this approach was abandoned.

# Upgrade pip itself before installing the packages below.
%pip install --upgrade pip

# PyTorch (CUDA 12.1 wheels). The install line is left commented out; uncomment it to use it.
# If your CUDA version differs, change the `cu121` part of the index URL to match.
#%pip install --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio

# Ultralytics YOLO + essentials
%pip install ultralytics opencv-python tqdm matplotlib numpy pyyaml pandas

# Transformers (plus accelerate, timm, pillow) for the OWL-ViT open-vocabulary demo
%pip install transformers accelerate timm pillow


In [None]:
# Environment sanity check: report the PyTorch version, whether CUDA is
# usable, and the output of `nvidia-smi`.
import subprocess
import sys

import torch

print(f"PyTorch: {torch.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

print("\n=== nvidia-smi ===")
try:
    smi_report = subprocess.check_output(["nvidia-smi"]).decode()
    print(smi_report)
except Exception as err:
    # Best-effort: a missing or failing nvidia-smi is reported on stderr
    # instead of aborting the cell.
    print(f"nvidia-smi not available: {err}", file=sys.stderr)

In [None]:
from pathlib import Path

import torch
from PIL import Image, ImageDraw
from transformers import OwlViTForObjectDetection, OwlViTProcessor

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the OWL-ViT checkpoint and its matching processor.
model_id = "google/owlvit-base-patch32"
processor = OwlViTProcessor.from_pretrained(model_id)
model = OwlViTForObjectDetection.from_pretrained(model_id)
model = model.to(device)

# ⬇️ replace with your test image
test_image_path = str(Path("~/images/alochol2.jpeg").expanduser())
image = Image.open(test_image_path).convert("RGB")

# Text queries for the single image above (nested list: one inner list of
# prompts for that image).
texts = [[
    "car plate",
    "number plate",
    "license plate",
    "cigarette",
    "smoking",
    "alcohol",
    "wine bottle",
    "beer can",
]]

# Forward pass without gradient tracking.
inputs = processor(text=texts, images=image, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# PIL's `image.size` is (w, h); reversing it yields the (h, w) pair passed
# as the target size.
target_sizes = torch.tensor([image.size[::-1]]).to(device)
results = processor.post_process_object_detection(
    outputs=outputs,
    target_sizes=target_sizes,
)[0]

# Draw every detection at or above the score threshold on a copy of the
# image, and collect the same detections as plain dicts
# ({"label", "score", "xyxy"}).
SCORE_THRESHOLD = 0.20  # detections scoring below this are skipped

draw = image.copy()
draw_ctx = ImageDraw.Draw(draw)
W, H = image.size  # image width/height; not used further in this cell

detections = []
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    score = float(score.item())
    if score < SCORE_THRESHOLD:
        continue

    # post_process_object_detection returns corner boxes as
    # (xmin, ymin, xmax, ymax). Unpack in exactly that order: swapping x and
    # y here would transpose both the drawn rectangle and the stored "xyxy".
    xmin, ymin, xmax, ymax = [float(v) for v in box.tolist()]

    # `label` indexes into the list of text queries for this image.
    label_text = texts[0][int(label)]

    detections.append({
        "label": label_text,
        "score": score,
        "xyxy": [xmin, ymin, xmax, ymax]
    })
    draw_ctx.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
    # Put the caption just above the box, clamped so it stays inside the image.
    draw_ctx.text((xmin, max(0, ymin-10)), f"{label_text} {score:.2f}", fill="red")

display(draw)
print("Detections:", detections)
