<a href="https://colab.research.google.com/github/giyushino/clip-vit-large-patch14-batch/blob/main/large_batch_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install torch
!pip install datasets
from transformers import CLIPProcessor, CLIPModel
import torch
from datasets import load_dataset
import torch.optim as optim
import torch.nn.functional as F
import time

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import random

In [3]:
# Fetch CIFAR-10 and build the class-name <-> integer-id lookup tables
datasets = load_dataset("cifar10")

# Class names come from the train split's label feature, in id order
labels = datasets["train"].features["label"].names
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
# Load the CLIP model and its matching processor from the same Hugging Face checkpoint
checkpoint = "openai/clip-vit-large-patch14"
model, processor = (
    CLIPModel.from_pretrained(checkpoint),
    CLIPProcessor.from_pretrained(checkpoint),
)

config.json:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [5]:
# Function to batch large datasets into smaller groups for easier computation
def homemade_batch(num_img, batch_size=10, start_img=0, data_type = "test"):
    """Classify a contiguous run of images with the global ``model``, one batch at a time.

    Args:
        num_img: Number of images to classify. Only full batches are run, so the
            trailing ``num_img % batch_size`` images are skipped.
        batch_size: Number of images per forward pass.
        start_img: Index (within the split) of the first image to classify.
        data_type: Name of the split of the global ``datasets`` to read from.

    Returns:
        A list with one ``[max_prob, max_id]`` entry per batch, where both items
        are CPU tensors with one element per image in the batch:
        [[tensor([first batch maximum probabilities]), tensor([corresponding indices/labels])],
         [tensor([second batch maximum probabilities]), tensor([corresponding indices/labels])]]
    """
    predictions = []
    num_batches = num_img // batch_size

    # Allows computations to be run on GPU instead of CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    run_start = time.perf_counter()

    # Inference only: without no_grad() every forward pass records an autograd
    # graph, which costs memory and time and is never used here.
    with torch.no_grad():
        for i in range(num_batches):
            batch_start = time.perf_counter()

            # Create a temporary batch of data
            first = i * batch_size + start_img
            subset = datasets[data_type].select(range(first, first + batch_size))
            inputs = processor(text=labels, images=subset["img"], return_tensors="pt", padding=False).to(device)
            outputs = model(**inputs)

            # Softmax over the label axis turns image-to-text logits into per-label probabilities
            probs = outputs.logits_per_image.softmax(dim=1)

            # Keep only the most likely label and its probability for each image
            max_prob, max_id = probs.max(dim=1)
            predictions.append([max_prob.cpu(), max_id.cpu()])
            torch.cuda.empty_cache()

            if i % 50 == 0:
                batch_end = time.perf_counter()
                print(f"Finished batch {i + 1} of {num_batches} in {batch_end - batch_start} seconds")

    run_end = time.perf_counter()
    print(f"Finished entire dataset in {run_end - run_start} seconds")

    return predictions

In [6]:
# Takes output of homemade_batch as input and returns clean data
def prediction_reformat(subset):
    """Flatten per-batch prediction tensors into one row per image.

    Args:
        subset: List of ``[probabilities, ids]`` pairs, one per batch, where both
            items are 1-D tensors of equal length (the output of ``homemade_batch``).
            Batches may have different lengths.

    Returns:
        Nested list with form [[index, "label", probability, id],
                               [index, "label", probability, id]]
        where ``index`` counts images from 0 across all batches and ``label`` is
        looked up in the global ``id2label``.
    """
    predicted = []

    for batch in subset:
        # Size the inner loop from this batch, not the first one, so a shorter
        # (or longer) batch neither raises IndexError nor loses rows.
        batch_probs = batch[0].tolist()
        batch_ids = batch[1].tolist()
        for prob, class_id in zip(batch_probs, batch_ids):
            predicted.append([len(predicted), id2label[class_id], prob, class_id])

    return predicted

In [7]:
# Computes how accurate the model is
def accuracy(result, data_type = "test", start_img=0):
    """Print overall and per-label accuracy of a set of predictions.

    Args:
        result: Rows of ``[index, "label", probability, id]`` as produced by
            ``prediction_reformat``; row ``k`` is compared against image
            ``start_img + k`` of the split.
        data_type: Name of the split of the global ``datasets`` holding the ground truth.
        start_img: Index of the image the first row refers to. Must match the
            ``start_img`` the predictions were generated with.

    Returns:
        None. All results are printed.

    Note:
        Per-label statistics are grouped by the *predicted* label: for each label
        they report how many of the predictions of that label were right.
    """
    class_names = datasets[data_type].features["label"].names

    if not result:
        # Nothing to score; avoids dividing by zero below
        print("No predictions to evaluate.")
        return

    # Read only the label column of the relevant rows instead of indexing whole
    # examples one by one (which also loads each image).
    window = range(start_img, start_img + len(result))
    true_ids = list(datasets[data_type].select(window)["label"])

    # How often each label was predicted, and how often that prediction was wrong
    predicted_count = dict.fromkeys(class_names, 0)
    incorrect = dict.fromkeys(class_names, 0)
    correct = 0

    for seen, (row, true_id) in enumerate(zip(result, true_ids), start=1):
        label, prob, pred_id = row[1], row[2], row[3]
        predicted_count[label] += 1

        hit = true_id == pred_id
        if hit:
            correct += 1
        else:
            incorrect[label] += 1

        # Report a sample prediction every 50 images
        if seen % 50 == 0:
            if hit:
                print(f"Model accurately predicted {label} with {prob * 100}% confidence.")
            else:
                print(f"Model inaccurately predicted {label} with {prob * 100}% confidence.")

    print(f"Accuracy: {(correct/len(result)) * 100}%")

    worst_accuracy = []
    for label in class_names:
        total = predicted_count[label]
        if total == 0:
            # A label that was never predicted has no ratio to report
            print(f"For {label}: never predicted, so no accuracy can be reported.")
            continue
        right = total - incorrect[label]
        print(f"For {label}: Predicted {right} out of {total} correct. {(right) / total * 100}% Accuracy")
        worst_accuracy.append([label, right/total])

    # result is non-empty here, so at least one label was predicted
    worst_group = min(worst_accuracy, key=lambda x: x[1])
    print(f"The worst performing group is '{worst_group[0]}' with an accuracy of {worst_group[1] * 100}%")

In [8]:
def data_analysis(predictions, data_type = "test"):
    """Flatten raw batch predictions and score them against ``data_type``.

    ``predictions`` is passed through ``prediction_reformat`` and the resulting
    rows are handed to ``accuracy``; whatever ``accuracy`` returns is returned.
    """
    return accuracy(prediction_reformat(predictions), data_type)

In [9]:
# Train model on training dataset
def train(num_img, batch_size=10, num_epoch=2):
    """Fine-tune the global ``model`` on the first ``num_img`` training images, in order.

    Args:
        num_img: Number of training images to use per epoch. Only full batches
            are run, so the trailing ``num_img % batch_size`` images are skipped.
        batch_size: Number of images per optimisation step.
        num_epoch: Number of passes over those images.

    Raises:
        ValueError: If ``num_img`` is smaller than ``batch_size`` (no batch to run).

    The model is left in training mode; call ``model.eval()`` before evaluating.
    """
    num_batches = num_img // batch_size
    if num_batches == 0:
        raise ValueError("num_img must be at least batch_size so that one full batch can be run")

    # Set up training parameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(num_epoch):
        epoch_start = time.perf_counter()
        epoch_loss = 0

        # Separate training data into smaller batches
        for i in range(num_batches):
            train_set = datasets["train"].select(range(i * batch_size, (i + 1) * batch_size))
            batch_start = time.perf_counter()

            # Process the data, feed it into the model
            inputs = processor(text=labels, images=train_set["img"], return_tensors="pt", padding=False).to(device)
            outputs = model(**inputs)

            # Accesses the ground truth
            targets = torch.tensor(train_set["label"]).to(device)

            # Classification loss over the label prompts: one row of logits per
            # image, one target label per image. logits_per_text is the transpose
            # (one row per label prompt), so scoring it against per-image targets
            # only ran when batch_size happened to equal the number of labels and
            # paired each prompt with an unrelated image's label.
            loss = F.cross_entropy(outputs.logits_per_image, targets)

            batch_end = time.perf_counter()
            if i % 50 == 0:
                print(f"Finished batch {i + 1}/{num_img // batch_size} in {batch_end - batch_start} seconds")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        epoch_end = time.perf_counter()
        avg_loss = epoch_loss / num_batches
        # Measured from the start of the epoch, not from the start of its last batch
        print(f"Epoch {epoch+1}/{num_epoch} completed in {epoch_end - epoch_start} seconds, Loss: {avg_loss:.4f}")

In [10]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import time
import random

def train_shuffled(num_img, batch_size=10, num_epoch=2):
    """Fine-tune the global ``model`` on ``num_img`` training images, reshuffled every epoch.

    Args:
        num_img: Number of training images to use per epoch. Only full batches
            are run, so the trailing ``num_img % batch_size`` images are skipped.
        batch_size: Number of images per optimisation step.
        num_epoch: Number of passes; each pass draws from a freshly shuffled
            copy of the whole training split.

    Raises:
        ValueError: If ``num_img`` is smaller than ``batch_size`` (no batch to run).

    The model is left in training mode; call ``model.eval()`` before evaluating.
    """
    num_batches = num_img // batch_size
    if num_batches == 0:
        raise ValueError("num_img must be at least batch_size so that one full batch can be run")

    # Set up training parameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(num_epoch):
        epoch_start = time.perf_counter()
        epoch_loss = 0

        # Shuffle the entire training dataset once per epoch
        shuffled_dataset = datasets["train"].shuffle(seed=random.randint(0, 1000))

        # Separate training data into smaller batches
        for i in range(num_batches):
            # Select the current batch from the shuffled dataset
            train_set = shuffled_dataset.select(range(i * batch_size, (i + 1) * batch_size))
            batch_start = time.perf_counter()

            # Process the data, feed it into the model
            inputs = processor(text=labels, images=train_set["img"], return_tensors="pt", padding=False).to(device)
            outputs = model(**inputs)

            # Access ground truth
            targets = torch.tensor(train_set["label"]).to(device)

            # Classification loss over the label prompts: one row of logits per
            # image, one target label per image. logits_per_text is the transpose
            # (one row per label prompt), so scoring it against per-image targets
            # only ran when batch_size happened to equal the number of labels and
            # paired each prompt with an unrelated image's label.
            loss = F.cross_entropy(outputs.logits_per_image, targets)

            batch_end = time.perf_counter()
            # Report every 50th batch, like the other loops in this notebook
            if i % 50 == 0:
                print(f"Finished batch {i + 1}/{num_img // batch_size} in {batch_end - batch_start} seconds")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss / num_batches
        epoch_end = time.perf_counter()
        print(f"Epoch {epoch+1}/{num_epoch} completed in {epoch_end - epoch_start} seconds, Loss: {avg_loss:.4f}")


In [11]:
# Baseline: score the model on the first 2500 test images before the fine-tuning below
pretrained = homemade_batch(num_img=2500)
data_analysis(pretrained, "test")

# Fine-tune for 5 epochs on 5000 images drawn from the reshuffled training split
train_shuffled(num_img=5000, num_epoch=5)

# Leave training mode before scoring again
model.eval()

# Score the same 2500 test images with the fine-tuned weights
trained = homemade_batch(num_img=2500)
data_analysis(trained, "test")

Finished batch 1 of 250 in 2.4558167170000047 seconds
Finished batch 51 of 250 in 0.6429177919999916 seconds
Finished batch 101 of 250 in 0.6636242339999967 seconds
Finished batch 151 of 250 in 0.6833492279999973 seconds
Finished batch 201 of 250 in 0.667208842999969 seconds
Finished entire dataset in 168.604642221 seconds
Model accurately predicted frog with 99.14552569389343% confidence.
Model accurately predicted horse with 98.48936200141907% confidence.
Model accurately predicted bird with 98.03536534309387% confidence.
Model accurately predicted ship with 90.92010259628296% confidence.
Model accurately predicted bird with 95.06943225860596% confidence.
Model accurately predicted frog with 99.66448545455933% confidence.
Model accurately predicted truck with 99.64094758033752% confidence.
Model accurately predicted cat with 89.27355408668518% confidence.
Model accurately predicted bird with 97.3240613937378% confidence.
Model accurately predicted airplane with 58.127784729003906% co