# Vision Transformer Classifier + Novelty Detector with Masked Input and Diffusion-Augmentation

In [0]:
# Install required libraries for deep learning, vision models, and utilities.
!pip install torch torchvision timm transformers diffusers opencv-python scikit-learn matplotlib albumentations umap-learn seaborn

In [0]:
# Import core libraries for modeling, image handling, and metrics.

import os
import torch
import torchvision.transforms as T
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset
from transformers import ViTForImageClassification, ViTFeatureExtractor
from diffusers import StableDiffusionImg2ImgPipeline
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score


In [0]:
# Define masking function to simulate occluded images using a donut-style mask.

def apply_donut_mask(image, inner_ratio=0.25, outer_ratio=0.75):
    """Keep only a centered ring ("donut") of ``image`` and black out the rest.

    Both radii are derived from the image width: the outer disc has a
    diameter of ``outer_ratio * width`` and the inner hole a diameter of
    ``inner_ratio * width``. Pixels outside the ring are zeroed.
    """
    height, width = image.shape[:2]
    center = (width // 2, height // 2)
    outer_radius = int(width * outer_ratio / 2)
    inner_radius = int(width * inner_ratio / 2)

    ring = np.zeros((height, width), dtype=np.uint8)
    # Paint the filled outer disc first, then punch the inner hole back out.
    for radius, value in ((outer_radius, 255), (inner_radius, 0)):
        cv2.circle(ring, center, radius, value, -1)

    return cv2.bitwise_and(image, image, mask=ring)


In [0]:
# Load the pre-trained Stable Diffusion img2img pipeline in half precision and
# place it on the GPU; it is used to re-synthesize the masked images.

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float16,
).to("cuda")

def generate_diffused_image(image_pil, prompt="a tennis ball", strength=0.75, guidance_scale=7.5):
    """Run the img2img pipeline on ``image_pil`` and return the first result.

    The input is resized to 512x512 before being passed to the module-level
    ``pipe``; ``prompt``, ``strength`` and ``guidance_scale`` are forwarded
    unchanged.
    """
    init_image = image_pil.resize((512, 512))
    pipeline_kwargs = {
        "prompt": prompt,
        "image": init_image,
        "strength": strength,
        "guidance_scale": guidance_scale,
    }
    return pipe(**pipeline_kwargs).images[0]


## Step 4: Generate and Save Augmented Dataset

In [0]:
# Generate masked images and produce their diffused versions for training a robust model.

from glob import glob
from tqdm import tqdm

raw_folder = "data/raw/"
masked_folder = "data/masked/"
diffused_folder = "data/diffused/"

os.makedirs(masked_folder, exist_ok=True)
os.makedirs(diffused_folder, exist_ok=True)

# Sorted so the processing order (and anything derived from image_paths) is
# reproducible; glob alone returns files in arbitrary order. os.path.join
# avoids the doubled slash that "data/raw/" + "/*.jpg" would produce.
image_paths = sorted(glob(os.path.join(raw_folder, "*.jpg")))

for path in tqdm(image_paths):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports an unreadable/corrupt file by returning None
        # rather than raising; skip it instead of crashing mid-run.
        tqdm.write(f"Skipping unreadable image: {path}")
        continue

    name = os.path.basename(path)
    masked = apply_donut_mask(img)
    cv2.imwrite(os.path.join(masked_folder, name), masked)

    # OpenCV loads images as BGR; convert so PIL sees the channels as RGB.
    pil_img = Image.fromarray(cv2.cvtColor(masked, cv2.COLOR_BGR2RGB))
    generated = generate_diffused_image(pil_img, prompt="a tennis ball")
    generated.save(os.path.join(diffused_folder, name))


## Step 5: Load Data for ViT Classifier

In [0]:
# Create a custom PyTorch dataset class and data loader using the diffused dataset.

class CustomImageDataset(Dataset):
    """Dataset over the ``*.jpg`` files of a folder, labelled by file name.

    Args:
        folder: Directory searched (non-recursively) for ``*.jpg`` files.
        labels: Mapping from image file name (basename) to its label.
        transform: Callable applied to each RGB PIL image before it is returned.

    Raises:
        KeyError: If any image found in ``folder`` has no entry in ``labels``.
    """

    def __init__(self, folder, labels, transform):
        # Sort so that index -> sample is stable across runs; glob returns
        # files in arbitrary, filesystem-dependent order.
        self.paths = sorted(glob(os.path.join(folder, "*.jpg")))

        # Fail with an actionable message instead of a bare KeyError('x.jpg').
        unlabeled = [p for p in self.paths if os.path.basename(p) not in labels]
        if unlabeled:
            raise KeyError(
                f"No label for {len(unlabeled)} image(s) in {folder!r}, "
                f"e.g. {os.path.basename(unlabeled[0])!r}"
            )

        self.labels = [labels[os.path.basename(p)] for p in self.paths]
        self.transform = transform

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for the sample at ``idx``."""
        # The context manager closes the file handle once the pixel data has
        # been copied into the converted image.
        with Image.open(self.paths[idx]) as img:
            rgb = img.convert("RGB")
        return self.transform(rgb), self.labels[idx]

    def __len__(self):
        """Return the number of images found in the folder."""
        return len(self.paths)

labels = {os.path.basename(p): 0 for p in image_paths}  # Dummy labels
# Normalize each channel with mean/std 0.5 so pixel values land in [-1, 1]
# instead of the raw [0, 1] produced by ToTensor. This mirrors the image
# preprocessing published with the google/vit-base-patch16-224-in21k
# checkpoint that is fine-tuned below.
transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
dataset = CustomImageDataset(diffused_folder, labels, transform)
loader = DataLoader(dataset, batch_size=8, shuffle=True)


## Step 6: Train ViT Classifier

In [0]:
# Fine-tune a pre-trained Vision Transformer on the augmented dataset.

model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k", num_labels=3)
model = model.to("cuda")
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(3):
    model.train()
    running_loss, num_batches = 0.0, 0
    # Named `targets` (not `labels`) so the module-level `labels` dict is not
    # overwritten by the last batch.
    for imgs, targets in loader:
        imgs = imgs.to("cuda")
        # The DataLoader already collates labels into a tensor; as_tensor
        # reuses it, whereas torch.tensor(tensor) copy-constructs and warns.
        targets = torch.as_tensor(targets).to("cuda")
        outputs = model(imgs).logits
        loss = loss_fn(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Avoids a NameError on `loss` when the loader yields nothing.
        print(f"Epoch {epoch+1}: loader produced no batches")
    else:
        # Report the epoch mean rather than only the final batch's loss.
        print(f"Epoch {epoch+1} Loss: {running_loss / num_batches:.4f}")


## Step 7: Novelty Detection Logic

In [0]:
# Predict and detect novelty by measuring classification confidence against a threshold.

def predict_with_novelty(image, threshold=0.6):
    """Classify ``image`` with the module-level ``model``; flag low confidence as novel.

    Args:
        image: PIL image to classify. It is converted to RGB when it is in
            another mode.
        threshold: Minimum top softmax probability required to accept a class
            prediction.

    Returns:
        A ``(label, top_prob)`` tuple. ``label`` is ``"Novel"`` when the top
        probability is below ``threshold`` and ``"Class <index>"`` otherwise.
    """
    # Grayscale, palette or RGBA images would otherwise be turned into tensors
    # with a channel count other than 3.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # Follow the model's actual device instead of assuming CUDA.
    device = next(model.parameters()).device

    model.eval()
    with torch.no_grad():
        inputs = transform(image).unsqueeze(0).to(device)
        logits = model(inputs).logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

    top_prob = np.max(probs)
    if top_prob < threshold:
        return "Novel", top_prob
    return f"Class {np.argmax(probs)}", top_prob

# Load as RGB (matching how CustomImageDataset reads its images) and close the
# file handle as soon as the pixel data has been copied.
with Image.open("some_test_image.jpg") as img_file:
    test_img = img_file.convert("RGB")
pred, conf = predict_with_novelty(test_img)
print(f"Prediction: {pred}, Confidence: {conf:.2f}")


## Step 8: Visualize Embeddings with t-SNE

In [0]:
# Visualize the learned embedding space using t-SNE for interpretability.

from sklearn.manifold import TSNE
import seaborn as sns

embeddings, labels_list = [], []
model.eval()
with torch.no_grad():  # entered once instead of once per batch
    for imgs, lbls in loader:
        imgs = imgs.to("cuda")
        feats = model.vit(imgs).last_hidden_state[:, 0, :]  # CLS token
        embeddings.append(feats.cpu().numpy())
        # Store plain Python values: `list += tensor` appends 0-d tensors,
        # which hash by identity, so equal labels would not share a hue.
        labels_list.extend(torch.as_tensor(lbls).tolist())
embeddings = np.concatenate(embeddings)

# t-SNE requires perplexity < n_samples; clamp the default (30) so small
# datasets do not fail. For 31+ samples this is the library default.
perplexity = min(30, max(1, len(embeddings) - 1))
tsne = TSNE(n_components=2, perplexity=perplexity)
proj = tsne.fit_transform(embeddings)

sns.scatterplot(x=proj[:, 0], y=proj[:, 1], hue=labels_list)
plt.title("ViT Embedding Space (t-SNE)")
plt.show()
