<a href="https://colab.research.google.com/github/ethvedbitdesjan/SummerResearch/blob/main/ViTLarge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install datasets transformers



In [2]:
!pip install accelerate -U



In [3]:
from google.colab import drive
# Mount Google Drive at the relative mount point 'content/'; the later %cd
# navigates into this mount.
drive.mount(mountpoint='content/')

Drive already mounted at content/; to attempt to forcibly remount, call drive.mount("content/", force_remount=True).


In [4]:
%cd content/MyDrive/SummerResearch

/content/content/MyDrive/SummerResearch


In [5]:
from datasets import load_dataset

# Build the DatasetDict from the image files under Dataset/ using the generic
# "imagefolder" loader.
dataset = load_dataset(path="imagefolder", data_dir="Dataset/")

Resolving data files:   0%|          | 0/1615 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/539 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/539 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Display the loaded splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1615
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 539
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 539
    })
})

In [7]:
# Critical imports
import os
import numpy as np
import pandas as pd
from PIL import Image
import random
import cv2
import copy
import torch
import torchvision
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
from torch.optim import lr_scheduler

In [8]:
import shutil

In [9]:
from transformers import ViTImageProcessor

# Checkpoint name shared by the image preprocessor and the model loaded later.
model_name_or_path = 'google/vit-large-patch16-224-in21k'

# ViTImageProcessor is the supported replacement for the deprecated
# ViTFeatureExtractor and applies the same preprocessing. The variable keeps the
# name `feature_extractor` because later cells reference it.
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path)



In [10]:
def transform(example_batch):
    """Turn a batch of raw examples into model inputs on the fly.

    Args:
        example_batch: Batch dict holding a list of PIL images under 'image'
            and the matching class indices under 'label'.

    Returns:
        The feature extractor's output (PyTorch tensors, including
        'pixel_values') with the class indices added under 'labels'.
    """
    # Force every image to 3-channel RGB first: grayscale, palette or RGBA
    # files would otherwise reach the preprocessor with the wrong channel count.
    images = [image.convert('RGB') for image in example_batch['image']]
    inputs = feature_extractor(images, return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['label']
    return inputs

# Attach `transform` so it runs lazily whenever examples are accessed.
prepared_ds = dataset.with_transform(transform=transform)

In [11]:
# Reuse the DatasetDict loaded earlier instead of resolving the files under
# Dataset/ a second time; `ds` stays defined as an alias of `dataset`.
ds = dataset
prepared_ds = ds.with_transform(transform)

Resolving data files:   0%|          | 0/1615 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/539 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/539 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Display the DatasetDict that has the on-the-fly transform attached.
prepared_ds

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1615
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 539
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 539
    })
})

In [13]:
import numpy as np
from datasets import load_metric

def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict of tensors.

    Args:
        batch: Sequence of examples, each providing a 'pixel_values' tensor
            and a 'labels' value.

    Returns:
        dict with 'pixel_values' (the example tensors stacked along a new
        leading dimension) and 'labels' (a tensor built from the per-example
        labels).
    """
    stacked_pixels = torch.stack([example['pixel_values'] for example in batch])
    label_tensor = torch.tensor([example['labels'] for example in batch])
    return {'pixel_values': stacked_pixels, 'labels': label_tensor}

def compute_metrics(p):
    """Compute classification accuracy for a `Trainer` evaluation pass.

    Accuracy is computed directly with NumPy rather than through
    `datasets.load_metric`, which is deprecated, so the function does not rely
    on a module-level metric object.

    Args:
        p: Object exposing `predictions` (per-class scores, one row per
            sample, or a tuple whose first element is that array) and
            `label_ids` (the true class indices).

    Returns:
        dict: `{"accuracy": fraction of samples whose highest-scoring class
        equals the true label, as a float}`.
    """
    # Some models return a tuple of outputs; the class scores come first.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted = np.argmax(scores, axis=1)
    # Flatten so a column-shaped label array cannot broadcast against `predicted`.
    references = np.asarray(p.label_ids).reshape(-1)
    return {"accuracy": float(np.mean(predicted == references))}

  metric = load_metric("accuracy")


In [14]:
from transformers import ViTForImageClassification

# Class names taken from the training split's 'label' feature.
labels = prepared_ds['train'].features['label'].names

# Load the pretrained checkpoint with a classification head sized to the number
# of classes; both mappings pair each class index (as a string) with its name.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=dict((str(idx), name) for idx, name in enumerate(labels)),
    label2id=dict((name, str(idx)) for idx, name in enumerate(labels)),
)

# NOTE(review): none of the three names below is referenced again in the cells
# visible here, and BATCH_SIZE differs from the per-device batch size passed to
# TrainingArguments — confirm whether they are still needed.
NUM_CLASSES = 2
BATCH_SIZE = 16
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Some weights of the model checkpoint at google/vit-large-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and metric files are written, and where logs go.
    output_dir="./vit-large",
    report_to='tensorboard',
    push_to_hub=False,
    # Optimisation settings.
    optim='adamw_torch',
    learning_rate=1e-5,
    weight_decay=0.1,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    fp16=True,
    # Log every 10 steps; evaluate and checkpoint every 200 steps.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Keep the raw 'image' column: the on-the-fly `transform` reads it.
    remove_unused_columns=False,
)

In [16]:
from transformers import Trainer

# Wire the model, run configuration, data splits and helper callables together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # The image preprocessor is handed over through the `tokenizer` slot,
    # presumably so it is saved next to the model — confirm.
    tokenizer=feature_extractor,
)

In [17]:
# Run fine-tuning; the returned object exposes the run's metrics via `.metrics`.
train_results = trainer.train()
# Persist the model held by the trainer after training.
trainer.save_model()
# Print the training metrics, then write them to disk under the "train" split.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
# Persist the trainer's own state as well.
trainer.save_state()

Step,Training Loss,Validation Loss,Accuracy
200,0.594,0.638344,0.634508
400,0.384,0.629898,0.67718
600,0.1787,0.749544,0.669759
800,0.0615,0.84901,0.667904


***** train metrics *****
  epoch                    =          4.0
  total_flos               = 1648082213GF
  train_loss               =       0.3331
  train_runtime            =   0:15:56.20
  train_samples_per_second =        6.756
  train_steps_per_second   =        0.845


In [18]:
# Score the validation split, then print and persist the resulting metrics
# under the "eval" split name.
metrics = trainer.evaluate(eval_dataset=prepared_ds['validation'])
trainer.log_metrics(split="eval", metrics=metrics)
trainer.save_metrics(split="eval", metrics=metrics)

***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.6772
  eval_loss               =     0.6299
  eval_runtime            = 0:00:28.13
  eval_samples_per_second =     19.156
  eval_steps_per_second   =      2.417


In [19]:
# Score the held-out test split; the returned object carries the `test_`-prefixed
# scores in its `.metrics` dict.
metrics = trainer.predict(prepared_ds['test'])
# Log and save under the "test" split. Using "eval" here would print the test
# scores under an "eval metrics" header and overwrite the results file written
# for the validation split.
trainer.log_metrics("test", metrics.metrics)
trainer.save_metrics("test", metrics.metrics)

***** eval metrics *****
  test_accuracy           =     0.6419
  test_loss               =     0.6458
  test_runtime            = 0:03:44.59
  test_samples_per_second =        2.4
  test_steps_per_second   =      0.303
