In [None]:
!pip install git+https://github.com/openai/CLIP.git


In [None]:
!pip install torch torchvision torchaudio ultralytics transformers pillow numpy opencv-python


In [None]:
import torch
import clip
from PIL import Image
import cv2
import numpy as np
from ultralytics import YOLO
from transformers import BlipProcessor, BlipForConditionalGeneration
from google.colab import files

# Pick the compute device once; every model below is placed on it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP (ViT-B/32 checkpoint): clip.load returns the model together with the
# image preprocessing callable, both used by classify_fashion.
model_clip, preprocess_clip = clip.load("ViT-B/32", device=device)

# YOLOv8 detector used by detect_defects.
# Change this to the fine-tuned model if available.
model_yolo = YOLO("yolov8n.pt")

# BLIP captioning: the processor prepares inputs and decodes generated ids,
# the model (moved to `device`) generates the caption tokens.
processor_blip = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model_blip = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model_blip = model_blip.to(device)

# Candidate labels that classify_fashion chooses between.
fashion_categories = [
    "t-shirt",
    "jeans",
    "jacket",
    "dress",
    "shoes",
    "handbag",
]

# Function to manually upload an image in Colab
def upload_image():
    """Open the Colab upload prompt and return the first uploaded filename.

    Returns:
        The first key of the mapping returned by ``files.upload()``.

    Raises:
        IndexError: If the upload mapping is empty (nothing was uploaded).
    """
    uploaded_files = files.upload()
    # Iterating the mapping yields its keys, i.e. the uploaded filenames.
    filenames = list(uploaded_files)
    return filenames[0]

# Helper Function: Classify Fashion Category using CLIP
def classify_fashion(image_path):
    """Return the entry of ``fashion_categories`` that best matches the image.

    The image and every category label are embedded with CLIP, both sets of
    embeddings are L2-normalised, and the label with the highest cosine
    similarity to the image is returned.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        One of the strings in ``fashion_categories``.
    """
    # The context manager releases the file handle as soon as the pixel data
    # has been turned into a tensor by the CLIP preprocessing pipeline.
    with Image.open(image_path) as img:
        image = preprocess_clip(img).unsqueeze(0).to(device)
    text_inputs = clip.tokenize(fashion_categories).to(device)

    with torch.no_grad():
        image_features = model_clip.encode_image(image)
        text_features = model_clip.encode_text(text_inputs)

        # Normalise so the dot product is a cosine similarity; without this
        # the ranking is skewed by the magnitude of each text embedding.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # 100.0 is the scale used in the CLIP zero-shot example; it sharpens
        # the softmax but does not change which label wins.
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    return fashion_categories[similarity.argmax().item()]

# Helper Function: Detect Defects using YOLOv8
def detect_defects(image_path, min_confidence=0.5):
    """Run the YOLO model on an image and collect confident detections.

    NOTE(review): labels come from ``result.names`` of whatever checkpoint
    ``model_yolo`` was loaded with; they only represent defect types if a
    model fine-tuned for defects has been substituted.

    Args:
        image_path: Path of the image file to analyse.
        min_confidence: Detections are kept only when their confidence is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        A list of ``(class_name, confidence, (x1, y1, x2, y2))`` tuples with
        integer box corners, in the order the model reported them.
    """
    results = model_yolo(image_path)
    defects = []

    for result in results:
        for box in result.boxes:
            confidence = box.conf[0].item()
            # Reject early so coordinates/labels are only decoded for boxes
            # that are actually returned.
            if confidence <= min_confidence:
                continue

            x1, y1, x2, y2 = map(int, box.xyxy[0])
            class_name = result.names[int(box.cls[0])]
            defects.append((class_name, confidence, (x1, y1, x2, y2)))

    return defects

# Helper Function: Generate Image Caption using BLIP
def generate_caption(image_path):
    """Produce a text caption for the image at ``image_path`` using BLIP.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string with special tokens removed.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = processor_blip(rgb_image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model_blip.generate(**model_inputs)

    # A single image was passed in, so only the first sequence is decoded.
    first_sequence = generated_ids[0]
    return processor_blip.decode(first_sequence, skip_special_tokens=True)

# Test the Model on a Sample Image
def process_image(image_path):
    """Run classification, detection and captioning on one image and print results.

    Args:
        image_path: Path of the image file to process.

    Returns:
        None. All results are written to standard output.
    """
    print(f"Processing: {image_path}")

    # Step 1: fashion category.
    predicted_category = classify_fashion(image_path)
    print(f"Fashion Category: {predicted_category}")

    # Step 2: detections, one line per (label, confidence, box) tuple.
    detections = detect_defects(image_path)
    if not detections:
        print("No defects detected.")
    else:
        print("Detected Defects:")
        for label, score, bbox in detections:
            print(f" - {label} (Confidence: {score:.2f}) at {bbox}")

    # Step 3: caption.
    caption_text = generate_caption(image_path)
    print(f"Generated Caption: {caption_text}")

# Notebook entry point: prompt for a manual upload in Google Colab and keep
# the uploaded file's name (raises IndexError if nothing was uploaded).
image_path = upload_image()

# Run classification, detection and captioning on the uploaded image and
# print the results.
process_image(image_path)
