In [1]:
# Install Kaggle
!pip install -q kaggle

# # Make a directory for Kaggle and move the kaggle.json file
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/

# # Set permissions for the API key file
# !chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d iamsouravbanerjee/animal-image-dataset-90-different-animals

# Unzip the dataset
!unzip -q animal-image-dataset-90-different-animals.zip -d ./data


Dataset URL: https://www.kaggle.com/datasets/iamsouravbanerjee/animal-image-dataset-90-different-animals
License(s): other
Downloading animal-image-dataset-90-different-animals.zip to /content
 99% 648M/656M [00:08<00:00, 82.6MB/s]
100% 656M/656M [00:08<00:00, 77.3MB/s]


In [25]:
!pip install timm transformers

Collecting timm
  Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading timm-1.0.9-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: timm
Successfully installed timm-1.0.9


In [26]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader, Dataset
from torch import nn, optim
from torchvision import datasets, models
import os
from tqdm import tqdm
import math
from PIL import Image, ImageOps
from torchvision.transforms.functional import to_pil_image
import matplotlib.pyplot as plt
import pickle
import timm

import random

Device Agnostic


In [3]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Transform for both original and edge datasets (resize and convert to tensor)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    # transforms.Lambda(lambda x: x.convert("RGB")),  # Convert grayscale images to RGB
    transforms.ToTensor()
])

In [5]:
# Define the path where the dataset is stored
data_dir = './data/animals/animals'

# Load the dataset (all images in the "animals" folder)
dataset = datasets.ImageFolder(root=data_dir, transform=transform,)
print(f"Dataser size: {len(dataset)}")


# Splitting the dataset into training and validation
train_size = math.ceil(0.7 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
classes = train_dataset.dataset.classes  # List of categories


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")


Dataser size: 5400
Training set size: 3780
Validation set size: 1620


In [6]:
print(classes)
print(len(classes))

['antelope', 'badger', 'bat', 'bear', 'bee', 'beetle', 'bison', 'boar', 'butterfly', 'cat', 'caterpillar', 'chimpanzee', 'cockroach', 'cow', 'coyote', 'crab', 'crow', 'deer', 'dog', 'dolphin', 'donkey', 'dragonfly', 'duck', 'eagle', 'elephant', 'flamingo', 'fly', 'fox', 'goat', 'goldfish', 'goose', 'gorilla', 'grasshopper', 'hamster', 'hare', 'hedgehog', 'hippopotamus', 'hornbill', 'horse', 'hummingbird', 'hyena', 'jellyfish', 'kangaroo', 'koala', 'ladybugs', 'leopard', 'lion', 'lizard', 'lobster', 'mosquito', 'moth', 'mouse', 'octopus', 'okapi', 'orangutan', 'otter', 'owl', 'ox', 'oyster', 'panda', 'parrot', 'pelecaniformes', 'penguin', 'pig', 'pigeon', 'porcupine', 'possum', 'raccoon', 'rat', 'reindeer', 'rhinoceros', 'sandpiper', 'seahorse', 'seal', 'shark', 'sheep', 'snake', 'sparrow', 'squid', 'squirrel', 'starfish', 'swan', 'tiger', 'turkey', 'turtle', 'whale', 'wolf', 'wombat', 'woodpecker', 'zebra']
90


Style Transfer Dataer

In [7]:
# Download the Flickr Material Database dataset from Kaggle
!kaggle datasets download -d liewyousheng/flickr-material-database

# Unzip the dataset quietly into a 'fmd' directory
!unzip -q flickr-material-database.zip -d ./data/fmd

Dataset URL: https://www.kaggle.com/datasets/liewyousheng/flickr-material-database
License(s): unknown
Downloading flickr-material-database.zip to /content
 94% 53.0M/56.1M [00:00<00:00, 75.5MB/s]
100% 56.1M/56.1M [00:00<00:00, 63.7MB/s]


In [8]:
# Define the path where the Flickr Material Database is stored
fmd_data_dir = './data/fmd/image'

# Load the dataset
fmd_dataset = datasets.ImageFolder(root=fmd_data_dir, transform=transform)
texture_classes = fmd_dataset.classes

print(f"Dataset size: {len(fmd_dataset)}")
print(f"Texture classes: {fmd_dataset.classes}")
print(f"Number of texture classes: {len(fmd_dataset.classes)}")



Dataset size: 1000
Texture classes: ['fabric', 'foliage', 'glass', 'leather', 'metal', 'paper', 'plastic', 'stone', 'water', 'wood']
Number of texture classes: 10


Style Transfer

In [9]:
# Load pre-trained VGG19 model for style transfer
vgg = models.vgg19(pretrained=True).features.eval().to(device)


Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:13<00:00, 43.5MB/s]


In [10]:
# Function to extract features from the VGG model
def get_features(image, model, layers=None):
    if layers is None:
        layers = {'0': 'conv1_1', '5': 'conv2_1', '10': 'conv3_1', '19': 'conv4_1', '21': 'conv4_2', '28': 'conv5_1'}

    features = {}
    x = image
    for name, layer in model._modules.items():
        x = layer(x)
        if name in layers:
            features[layers[name]] = x
    return features

In [11]:
# Function to compute the Gram matrix (used for style loss)
def gram_matrix(tensor):
    if len(tensor.size()) == 3:  # If the tensor is missing a batch dimension
        tensor = tensor.unsqueeze(0)  # Add a batch dimension
    _, d, h, w = tensor.size()  # Unpack the dimensions
    tensor = tensor.view(d, h * w)  # Reshape the tensor to (channels, height * width)
    gram = torch.mm(tensor, tensor.t())  # Compute the Gram matrix
    return gram


In [12]:
# Apply style transfer
def apply_style_transfer(content_image, style_image, num_steps=50, style_weight=1000000, content_weight=1):
    # content_image = content_image.requires_grad_(True)
    # style_image = style_image.requires_grad_(True)

    # Ensure the images are 4D by adding a batch dimension if necessary
    if content_image.dim() == 3:
        content_image = content_image.unsqueeze(0)
    if style_image.dim() == 3:
        style_image = style_image.unsqueeze(0)

    # Move images to GPU if available
    content_image = content_image.to(device)
    style_image = style_image.to(device)

    # Pass the images through the VGG model to extract features
    content_features = get_features(content_image, vgg)
    style_features = get_features(style_image, vgg)

    # Compute the Gram matrices for the style image
    style_grams = {layer: gram_matrix(style_features[layer]) for layer in style_features}

    # Initialize the target image (content image to be optimized)
    target = content_image.clone().detach().requires_grad_(True).to(device)

    optimizer = optim.Adam([target], lr=0.01)
    loss_fn = torch.nn.MSELoss()

    for step in range(num_steps):
        # Extract the target features
        target_features = get_features(target, vgg)

        # Compute the content loss
        content_loss = content_weight * loss_fn(target_features['conv4_2'], content_features['conv4_2'])

        # Compute the style loss
        style_loss = 0
        for layer in style_grams:
            target_gram = gram_matrix(target_features[layer])
            d = target_gram.size(0)  # Since it's a 2D matrix (channels, channels)
            style_loss += style_weight * loss_fn(target_gram, style_grams[layer]) / (d * d)

        # Total loss
        total_loss = content_loss + style_loss

        # Backpropagation
        optimizer.zero_grad()
        total_loss.backward(retain_graph=True)
        optimizer.step()

        # Optionally, print the loss for each step
        # if step % 10 == 0:
        #     print(f"Step {step}, Total loss: {total_loss.item()}")

    return target.squeeze(0).to('cpu')  # Move back to CPU for further use


In [13]:
# class AugmentedValDataset(Dataset):
#     def __init__(self, val_dataset, texture_dataset, transform=None):
#         self.val_dataset = val_dataset
#         self.texture_dataset = texture_dataset
#         self.transform = transform

#     def __len__(self):
#         return len(self.val_dataset)

#     def __getitem__(self, idx):
#         # Get source image and label from validation dataset
#         source_img, source_class = self.val_dataset[idx]

#         # Randomly sample a texture image
#         texture_idx = random.randint(0, len(self.texture_dataset) - 1)
#         texture_img, texture_label = self.texture_dataset[texture_idx]

#         # Apply style transfer between the source image and the texture image
#         augmented_img = apply_style_transfer(source_img, texture_img)

#         # Convert the tensor back to a PIL image
#         augmented_img = to_pil_image(augmented_img)

#         # Apply any necessary transforms
#         if self.transform:
#             augmented_img = self.transform(augmented_img)

#         texture_label = texture_classes[texture_label]

#         # Return the augmented image, original image, source class, texture label, and texture image
#         return augmented_img, source_img, source_class, texture_label, texture_img


In [14]:
class AugmentedValDataset(Dataset):
    def __init__(self, val_dataset, texture_dataset, transform=None, load_from_file=None):
        self.val_dataset = val_dataset
        self.texture_dataset = texture_dataset
        self.transform = transform

        # If a file is provided, load the pre-augmented data from the file
        if load_from_file:
            self.preaugmented_data = self.load_data(load_from_file)
        else:
            # Pre-augment the entire dataset during initialization
            self.preaugmented_data = self.preaugment_data()

    def preaugment_data(self):
        preaugmented_data = []
        for idx in tqdm(range(len(self.val_dataset)), desc="Pre-augmenting data"):
            source_img, source_class = self.val_dataset[idx]
            texture_idx = random.randint(0, len(self.texture_dataset) - 1)
            texture_img, texture_label = self.texture_dataset[texture_idx]

            augmented_img = apply_style_transfer(source_img, texture_img)
            augmented_img = to_pil_image(augmented_img)

            if self.transform:
                augmented_img = self.transform(augmented_img)

            texture_label = texture_classes[texture_label]

            preaugmented_data.append((augmented_img, source_img, source_class, texture_label, texture_img))

        return preaugmented_data

    def save_data(self, file_path):
        """Save the pre-augmented dataset to a file."""
        with open(file_path, 'wb') as f:
            pickle.dump(self.preaugmented_data, f)

    def load_data(self, file_path):
        """Load the pre-augmented dataset from a file."""
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def __len__(self):
        return len(self.val_dataset)

    def __getitem__(self, idx):
        return self.preaugmented_data[idx]

In [15]:
# Define the path to the saved pre-augmented dataset
file_path = 'augmented_val_data.pkl'

# Check if the file exists
if os.path.exists(file_path):
    print("Loading pre-augmented data from file...")
    augmented_val_dataset = AugmentedValDataset(val_dataset, fmd_dataset, transform=transform, load_from_file=file_path)
else:
    print("Pre-augmenting data and saving to file...")
    augmented_val_dataset = AugmentedValDataset(val_dataset, fmd_dataset, transform=transform)
    augmented_val_dataset.save_data(file_path)
    print(f"Data saved to {file_path}")

# Now you can use the DataLoader
augmented_val_loader = DataLoader(augmented_val_dataset, batch_size=32, shuffle=False, num_workers=0)


Pre-augmenting data and saving to file...


Pre-augmenting data: 100%|██████████| 1620/1620 [1:16:10<00:00,  2.82s/it]


Data saved to augmented_val_data.pkl


In [17]:
from google.colab import files

# Path to the file
file_path = 'augmented_val_data.pkl'

# Download the file
files.download(file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
import os

# Path to the file
file_path = 'augmented_val_data.pkl'

# Check if the file exists and print its size
if os.path.exists(file_path):
    file_size = os.path.getsize(file_path)
    print(f"File size: {file_size / (1024 * 1024):.2f} MB")  # Convert to MB for readability
else:
    print("File not found.")


File size: 2792.14 MB


In [None]:
def plot_images(dataloader):
    num_images = 12
    fig, axs = plt.subplots(num_images, 3, figsize=(10, num_images * 3))

    # Get a batch of images
    augmented_images, source_images, source_classes, texture_labels, texture_images = next(iter(dataloader))
    source_classes = [classes[i] for i in source_classes]

    for i in range(num_images):
        # Augmented image
        axs[i, 0].imshow(augmented_images[i].permute(1, 2, 0))  # Convert from (C, H, W) to (H, W, C)
        axs[i, 0].set_title(f"Augmented {i + 1} | Source label: {source_classes[i]}")
        axs[i, 0].axis('off')

        # Original image
        axs[i, 1].imshow(source_images[i].permute(1, 2, 0))  # Convert from (C, H, W) to (H, W, C)
        axs[i, 1].set_title(f"Original {i + 1} | Source label: {source_classes[i]}")
        axs[i, 1].axis('off')

        # Texture image
        axs[i, 2].imshow(texture_images[i].permute(1, 2, 0))  # Convert from (C, H, W) to (H, W, C)
        axs[i, 2].set_title(f"Texture {i + 1} | Texture class: {texture_labels[i]}")
        axs[i, 2].axis('off')

    plt.tight_layout()
    plt.show()

plot_images(augmented_val_loader)

# ResNet101

## Finetuning On Baseline Train Dataset



Importing Pretrained Model

In [20]:
resnet_model = models.resnet101(pretrained=True)

# Freeze all layers except the classification head
for param in resnet_model.parameters():
    param.requires_grad = False

# Step 4: Modify the final layer for train dataset
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, 90)

# Move the model to the GPU if available
resnet_model = resnet_model.to(device)

# Step 5: Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet_model.fc.parameters(), lr=0.001)


Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:03<00:00, 48.2MB/s]


Finetuning the model

In [21]:
num_epochs = 3
resnet_train_acc = 0

for epoch in range(num_epochs):
    resnet_model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    # Wrap the train_loader with tqdm
    for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}", unit="batch"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = resnet_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct / total:.2f}%")
    resnet_train_acc = 100 * correct / total


Training Epoch 1/3: 100%|██████████| 119/119 [01:07<00:00,  1.76batch/s]


Epoch 1/3, Loss: 2.0018, Accuracy: 61.16%


Training Epoch 2/3: 100%|██████████| 119/119 [01:07<00:00,  1.77batch/s]


Epoch 2/3, Loss: 0.4836, Accuracy: 90.45%


Training Epoch 3/3: 100%|██████████| 119/119 [00:59<00:00,  2.01batch/s]

Epoch 3/3, Loss: 0.2908, Accuracy: 94.66%





## Evaluation

Validation on Baseline Dataset's Val Split

In [22]:
resnet_baseline_val_acc = 0

resnet_model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Wrap the val_loader with tqdm
    for inputs, labels in tqdm(val_loader, desc="Validating", unit="batch"):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = resnet_model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Validation Accuracy: {100 * correct / total:.2f}%")
resnet_baseline_val_acc = 100 * correct / total

Validating: 100%|██████████| 51/51 [00:27<00:00,  1.82batch/s]

Validation Accuracy: 90.19%





Validation on Modified Dataset

In [23]:
# Initializing variables for heatmap data
all_labels = []
all_predictions = []

resnet_texture_val_acc = 0

augmented_val_loader = DataLoader(augmented_val_dataset, batch_size=32, shuffle=False, num_workers=0)

resnet_model.eval()
correct = 0
total = 0
# with torch.no_grad():
    # Wrap the augmented_val_loader with tqdm
for augmented_images, source_images, source_classes, texture_labels, texture_images in tqdm(augmented_val_loader, desc="Validating", unit="batch"):
    augmented_images = augmented_images.to(device)

    outputs = resnet_model(augmented_images)
    _, predicted = torch.max(outputs, 1)

    # Accumulate labels and predictions for the confusion matrix
    all_labels.extend(source_classes)
    all_predictions.extend(predicted.cpu().numpy())

    # move predictions to cpu
    predicted = predicted.cpu()

    total += source_classes.size(0)
    correct += (predicted == source_classes).sum().item()

# Calculate and print validation accuracy
resnet_texture_val_acc = 100 * correct / total
print(f"Validation Accuracy: {resnet_texture_val_acc:.2f}%")

Validating: 100%|██████████| 51/51 [00:08<00:00,  5.84batch/s]

Validation Accuracy: 10.62%





Texture Bias for ResNet101


In [24]:
resnet_texture_bias = resnet_texture_val_acc / resnet_baseline_val_acc
resnet_texture_bias = 1- resnet_texture_bias
print(f"Texture Bias = {resnet_texture_bias:.4f}")

Texture Bias = 0.8823


# Vision Transformer

## Finetuning On Baseline Train Dataset

Importing Pretrained Model

In [27]:
vit_model = timm.create_model('vit_base_patch16_224', pretrained=True)

# Freeze the feature extractor backbone
for param in vit_model.parameters():
    param.requires_grad = False

# Replace the classifier head with a new head for 90 classes
vit_model.head = nn.Linear(vit_model.head.in_features, 90)
vit_model = vit_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vit_model.parameters(), lr=0.0001)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Finetuning the model

In [28]:
num_epochs = 3
vit_train_acc = 0

for epoch in range(num_epochs):
    vit_model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    # Wrap the train_loader with tqdm
    for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}", unit="batch"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = vit_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct / total:.2f}%")
    vit_train_acc = 100 * correct / total



Training Epoch 1/3: 100%|██████████| 119/119 [01:28<00:00,  1.35batch/s]


Epoch 1/3, Loss: 3.6524, Accuracy: 31.83%


Training Epoch 2/3: 100%|██████████| 119/119 [01:22<00:00,  1.45batch/s]


Epoch 2/3, Loss: 1.8701, Accuracy: 87.49%


Training Epoch 3/3: 100%|██████████| 119/119 [01:19<00:00,  1.49batch/s]

Epoch 3/3, Loss: 0.8964, Accuracy: 94.42%





## Evaluation

Validation on Baseline Dataset's Val Split

In [29]:
vit_baseline_val_acc = 0

vit_model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Wrap the val_loader with tqdm
    for inputs, labels in tqdm(val_loader, desc="Validating", unit="batch"):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = vit_model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Validation Accuracy: {100 * correct / total:.2f}%")
vit_baseline_val_acc = 100 * correct / total

Validating: 100%|██████████| 51/51 [00:35<00:00,  1.46batch/s]

Validation Accuracy: 95.62%





Validation on Modified Dataset

In [30]:
# Initializing variables for heatmap data
all_labels = []
all_predictions = []

vit_texture_val_acc = 0

augmented_val_loader = DataLoader(augmented_val_dataset, batch_size=32, shuffle=False, num_workers=0)

vit_model.eval()
correct = 0
total = 0
# with torch.no_grad():
    # Wrap the augmented_val_loader with tqdm
for augmented_images, source_images, source_classes, texture_labels, texture_images in tqdm(augmented_val_loader, desc="Validating", unit="batch"):
    augmented_images = augmented_images.to(device)

    outputs = vit_model(augmented_images)
    _, predicted = torch.max(outputs, 1)

    # Accumulate labels and predictions for the confusion matrix
    all_labels.extend(source_classes)
    all_predictions.extend(predicted.cpu().numpy())

    # move predictions to cpu
    predicted = predicted.cpu()

    total += source_classes.size(0)
    correct += (predicted == source_classes).sum().item()

# Calculate and print validation accuracy
vit_texture_val_acc = 100 * correct / total
print(f"Validation Accuracy: {vit_texture_val_acc:.2f}%")

Validating: 100%|██████████| 51/51 [00:18<00:00,  2.73batch/s]

Validation Accuracy: 67.35%





Texture Bias for ViT


In [31]:
vit_texture_bias = vit_texture_val_acc / vit_baseline_val_acc
vit_texture_bias = 1 - vit_texture_bias
print(f"Texture Bias = {vit_texture_bias:.4f}")

Texture Bias = 0.2957


# CLIP ViT

Importing Pretrained Model

In [32]:
from transformers import CLIPProcessor, CLIPModel


clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

clip_model.to(device)

# Prepare text inputs (class prompts) for zero-shot classification
text_inputs = [f"a photo of a {class_label}" for class_label in classes]
text_inputs = processor(text=text_inputs, return_tensors="pt", padding=True).to(device)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



## Evaluation

Validation on Baseline Dataset's Val Split

In [33]:
clip_baseline_val_acc = 0

clip_model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Wrap the val_loader with tqdm
    for inputs, labels in tqdm(val_loader, desc="Validating", unit="batch"):
        # inputs, labels = inputs.to(device), labels.to(device)

        # Preprocess images and move to the device
        images = [transforms.ToPILImage()(image) for image in inputs]  # Convert tensors back to PIL
        inputs = processor(images=images, return_tensors="pt", padding=True).to(device)

        # Encode the image and text inputs
        image_features = clip_model.get_image_features(**inputs)
        text_features = clip_model.get_text_features(**text_inputs)

        # Normalize the embeddings
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Calculate similarity between image and text
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Get the predicted class (highest similarity)
        predicted_classes = similarity.argmax(dim=-1)

        # Update correct predictions
        correct += (predicted_classes == labels.to(device)).sum().item()
        total += labels.size(0)


print(f"Validation Accuracy: {100 * correct / total:.2f}%")
clip_baseline_val_acc = 100 * correct / total

Validating: 100%|██████████| 51/51 [01:38<00:00,  1.94s/batch]

Validation Accuracy: 98.09%





Validation on Modified Dataset

In [42]:
clip_texture_val_acc = 0

clip_model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Wrap the val_loader with tqdm
    for augmented_images, source_images, source_classes, texture_labels, texture_images in tqdm(augmented_val_loader, desc="Validating", unit="batch"):
        # inputs, labels = inputs.to(device), labels.to(device)

        # Preprocess images and move to the device
        images = [transforms.ToPILImage()(image) for image in augmented_images]  # Convert tensors back to PIL
        inputs = processor(images=images, return_tensors="pt", padding=True).to(device)

        # Encode the image and text inputs
        image_features = clip_model.get_image_features(**inputs)
        text_features = clip_model.get_text_features(**text_inputs)

        # Normalize the embeddings
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Calculate similarity between image and text
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Get the predicted class (highest similarity)
        predicted_classes = similarity.argmax(dim=-1)

        # Update correct predictions
        correct += (predicted_classes == source_classes.to(device)).sum().item()
        total += source_classes.size(0)

print(f"Validation Accuracy: {100 * correct / total:.2f}%")
clip_texture_val_acc = 100 * correct / total

Validating: 100%|██████████| 51/51 [01:20<00:00,  1.57s/batch]

Validation Accuracy: 63.95%





Texture Bias for CLIP

In [43]:
clip_texture_bias = clip_texture_val_acc / clip_baseline_val_acc
clip_texture_bias = 1 - clip_texture_bias
print(f"Texture Bias = {clip_texture_bias:.4f}")

Texture Bias = 0.3480


# Texture Biases

In [44]:
from tabulate import tabulate

# Assuming these are your calculated shape biases
texture_biases = {
    "Model 1: ResNet101": resnet_texture_bias,
    "Model 2: ViT": vit_texture_bias,
    "Model 3: Clip-ViT-Large": clip_texture_bias,
}

# Creating a list of lists for table format
table_data = [[model, f"{bias:.4f}"] for model, bias in texture_biases.items()]

# Define the table headers
headers = ["Model", "Texture Bias"]

# Print the table in a fancy format
print(tabulate(table_data, headers=headers, tablefmt="fancy_grid"))


╒═════════════════════════╤════════════════╕
│ Model                   │   Texture Bias │
╞═════════════════════════╪════════════════╡
│ Model 1: ResNet101      │         0.8823 │
├─────────────────────────┼────────────────┤
│ Model 2: ViT            │         0.2957 │
├─────────────────────────┼────────────────┤
│ Model 3: Clip-ViT-Large │         0.348  │
╘═════════════════════════╧════════════════╛
