In [1]:
from datasets import load_dataset
from transformers import ViTFeatureExtractor

In [2]:
!git config --global credential.helper store

In [3]:
from huggingface_hub import login
login(token="<YOUR_HF_TOKEN>", add_to_git_credential=True)

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

Downloading (…)cessor_config.json";:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [5]:
ds = load_dataset('pittawat/uppercase-english-characters')

def transform(example_batch):
    inputs = feature_extractor([x for x in example_batch['image']], return_tensors='pt')
    inputs['labels'] = example_batch['label']
    return inputs

Downloading and preparing dataset parquet/pittawat--uppercase-english-characters to /root/.cache/huggingface/datasets/parquet/pittawat--uppercase-english-characters-e0d8f622696babc6/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/459k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.36M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/pittawat--uppercase-english-characters-e0d8f622696babc6/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
prepared_ds = ds.with_transform(transform)

In [7]:
import torch

def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [8]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [9]:
from transformers import ViTForImageClassification

labels = ds['train'].features['label'].names

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

Downloading (…)"config.json";:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-base-uppercase-english-characters",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=4,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=True,
  report_to='tensorboard',
  load_best_model_at_end=True,
)

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    tokenizer=feature_extractor,
)

Cloning https://huggingface.co/pittawat/vit-base-uppercase-english-characters into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/327M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Download file runs/Jan11_08-33-56_458fbaf9acf0/events.out.tfevents.1704962149.458fbaf9acf0.26.2: 100%|########…

Clean file training_args.bin:  29%|##9       | 1.00k/3.43k [00:00<?, ?B/s]

Download file runs/Jan11_08-33-56_458fbaf9acf0/1704962046.9654446/events.out.tfevents.1704962046.458fbaf9acf0.…

Clean file runs/Jan11_08-33-56_458fbaf9acf0/events.out.tfevents.1704962149.458fbaf9acf0.26.2: 100%|##########|…

Clean file runs/Jan11_08-33-56_458fbaf9acf0/1704962046.9654446/events.out.tfevents.1704962046.458fbaf9acf0.26.…

Download file runs/Jan11_08-33-56_458fbaf9acf0/events.out.tfevents.1704962046.458fbaf9acf0.26.0: 100%|########…

Clean file runs/Jan11_08-33-56_458fbaf9acf0/events.out.tfevents.1704962046.458fbaf9acf0.26.0:  14%|#4        |…

Clean file pytorch_model.bin:   0%|          | 1.00k/327M [00:00<?, ?B/s]

Using cuda_amp half precision backend


In [12]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 2340
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 296
  Number of trainable parameters = 85818650


Step,Training Loss,Validation Loss,Accuracy
100,0.5944,0.553828,0.948718
200,0.2241,0.315962,0.957265


***** Running Evaluation *****
  Num examples = 234
  Batch size = 16
Saving model checkpoint to ./vit-base-uppercase-english-characters/checkpoint-100
Configuration saved in ./vit-base-uppercase-english-characters/checkpoint-100/config.json
Model weights saved in ./vit-base-uppercase-english-characters/checkpoint-100/pytorch_model.bin
Image processor saved in ./vit-base-uppercase-english-characters/checkpoint-100/preprocessor_config.json
Image processor saved in ./vit-base-uppercase-english-characters/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 234
  Batch size = 16
Saving model checkpoint to ./vit-base-uppercase-english-characters/checkpoint-200
Configuration saved in ./vit-base-uppercase-english-characters/checkpoint-200/config.json
Model weights saved in ./vit-base-uppercase-english-characters/checkpoint-200/pytorch_model.bin
Image processor saved in ./vit-base-uppercase-english-characters/checkpoint-200/preprocessor_config.json
Image processor saved in

***** train metrics *****
  epoch                    =         4.0
  total_flos               = 675656873GF
  train_loss               =       0.695
  train_runtime            =  0:03:03.78
  train_samples_per_second =      50.928
  train_steps_per_second   =       1.611


In [13]:
metrics = trainer.evaluate(prepared_ds['test'])
trainer.log_metrics("test", metrics)
trainer.save_metrics("test", metrics)

***** Running Evaluation *****
  Num examples = 234
  Batch size = 16


***** test metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.9573
  eval_loss               =      0.316
  eval_runtime            = 0:00:01.91
  eval_samples_per_second =    122.237
  eval_steps_per_second   =      7.836


In [14]:
kwargs = {
    "finetuned_from": model.config._name_or_path,
    "tasks": "image-classification",
    "dataset": 'pittawat/uppercase-english-characters',
    "tags": ['image-classification'],
}

if training_args.push_to_hub:
    trainer.push_to_hub('🍻 cheers', **kwargs)
else:
    trainer.create_model_card(**kwargs)

Saving model checkpoint to ./vit-base-uppercase-english-characters
Configuration saved in ./vit-base-uppercase-english-characters/config.json
Model weights saved in ./vit-base-uppercase-english-characters/pytorch_model.bin
Image processor saved in ./vit-base-uppercase-english-characters/preprocessor_config.json
To https://huggingface.co/pittawat/vit-base-uppercase-english-characters
   44bb136..41636df  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Image Classification', 'type': 'image-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9572649572649573}]}
To https://huggingface.co/pittawat/vit-base-uppercase-english-characters
   41636df..c86e20f  main -> main

