In [1]:
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from transformers import AutoImageProcessor, ViTModel
import matplotlib.pyplot as plt
import time
import os
import copy
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)
import pandas as pd
from typing import *

PyTorch Version:  2.0.1
Torchvision Version:  0.15.2


In [2]:
num_gpus: int = torch.cuda.device_count()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
model_name = "resnet"
means = None

batch_size = 8
#binary case
num_classes_binary = 2
#multi class
num_classes_category = 37

In [4]:
from datasets import load_dataset

dataset = load_dataset("pcuenq/oxford-pets")

Downloading metadata:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 223.20 MiB, generated: 223.84 MiB, post-processed: Unknown size, total: 447.04 MiB) to C:/Users/gianv/.cache/huggingface/datasets/pcuenq___parquet/pcuenq--oxford-pets-43375e644eed3d52/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7390 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/gianv/.cache/huggingface/datasets/pcuenq___parquet/pcuenq--oxford-pets-43375e644eed3d52/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dataset['train']

Dataset({
    features: ['path', 'label', 'dog', 'image'],
    num_rows: 7390
})

# Feature extractor

In [6]:
from transformers import ViTImageProcessor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path)

Downloading (â€¦)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [61]:
def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = feature_extractor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['label']
    return inputs

In [62]:
prepared_ds = dataset.with_transform(transform)

In [80]:
from datasets import DatasetDict

train_testvalid = prepared_ds['train'].train_test_split(0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [81]:
dataset['train'][0:2]

{'path': ['/data/datasets/magic-ml/oxford-iiit-pet/images/Siamese_137.jpg',
  '/data/datasets/magic-ml/oxford-iiit-pet/images/Birman_98.jpg'],
 'label': ['Siamese', 'Birman'],
 'dog': [False, False],
 'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=345x500>,
  <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=290x370>]}

## Train and Evaluation

In [125]:
labels = set(dataset['train']['label'])

id2label={str(i): c for i, c in enumerate(labels)}
label2id={c: i for i, c in enumerate(labels)}

In [147]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.nn.functional.one_hot(torch.LongTensor(list(label2id[x['labels']] for x in batch))).double()
    }

In [148]:
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

In [149]:
from transformers import ViTForImageClassification

model = ViTForImageClassification.from_pretrained(
        model_name_or_path,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id
)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [143]:
from transformers import TrainingArguments

In [144]:
training_args = TrainingArguments(
    #output_dir='/gs/dldsproject/vit-base',
    './vit-base',
    per_device_train_batch_size=16,
    evaluation_strategy='steps',
    num_train_epochs=4,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    #push_to_hub=False,
    load_best_model_at_end=True,
)

In [145]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_test_valid_dataset["train"],
    eval_dataset=train_test_valid_dataset["valid"],
    tokenizer=feature_extractor,
)

In [146]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()



[0, 30, 28, 35, 23, 28, 32, 9, 27, 36, 6, 4, 30, 12, 3, 30]


Step,Training Loss,Validation Loss


[21, 23, 13, 31, 28, 28, 5, 23, 14, 5, 23, 16, 29, 16, 7, 34]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 6.00 GiB total capacity; 5.00 GiB already allocated; 0 bytes free; 5.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF