# Finetuning on MIT Indoor Scenes

https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019

https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb


In [1]:
from transformers import AdamW, ViTFeatureExtractor, ViTForImageClassification
import PIL
import torch
import pprint
from pathlib import Path
import os
import datasets


# Read data

In [2]:
# data_base_path = "/Users/vincent/datasets/mit_indoor_scenes/dataset/indoorCVPR_09/Images/"
# MODEL = "google/vit-base-patch16-224"
MODEL = "google/vit-base-patch16-224-in21k"
data_base_path = "/Users/vincent/datasets/mit_indoor_scenes/sample/"
data_directory = Path(data_base_path)
classes = [ dir_.name for dir_ in data_directory.iterdir()]
class_no = len(classes)
pprint.pformat(classes)

"['library', 'buffet']"

In [3]:
# path = "/Users/vincent/datasets/mit_indoor_scenes/dataset/indoorCVPR_09/Images/airport_inside/airport_inside_0201.jpg"
# image = PIL.Image.open(path)
# feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL)
# inputs = feature_extractor(images=image, return_tensors="pt")
# inputs

## Filter out JPEG Images

In [4]:
# python dicts are ordered since py3.6
# test_files_pixels_map = {str(dir_):  }
test_files_pixels_map_possible = {}
test_files_pixels_map_not_possible = {}
# we only want jpeg type of images to avoid downstream errors
for path in Path(data_base_path).rglob("*"):
    path_as_str = str(path)
    print(path_as_str)
    if os.path.isfile(path):
        pixels = PIL.Image.open(path_as_str)
        if isinstance(pixels, PIL.JpegImagePlugin.JpegImageFile):
            test_files_pixels_map_possible[path_as_str] = pixels
        else:
            test_files_pixels_map_not_possible[path_as_str] = pixels



# for path, pixels in test_files_pixels_map.items():
#     if os.path.isfile(path):
#         if isinstance(pixels, PIL.JpegImagePlugin.JpegImageFile):
#                 test_files_pixels_map_possible[path] = pixels
#         else:
#                 test_files_pixels_map_not_possible[path] = pixels
pixels = list(test_files_pixels_map_possible.values())
paths = list(test_files_pixels_map_possible.keys())

/Users/vincent/datasets/mit_indoor_scenes/sample/library
/Users/vincent/datasets/mit_indoor_scenes/sample/buffet
/Users/vincent/datasets/mit_indoor_scenes/sample/library/biblio_livre.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/ins20.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/int91.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/library05.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/library_journals_books.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/library04.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/fine_arts.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/ins21.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/gallerie_1130426509812_81_80_90_133.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/bibliotheaaak.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/bnf.jpg
/Users/vincent/datasets/mit_indoor_scenes/sample/library/450px_Bibliothek_im_R

## Extract Features

In [5]:

feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL)
batch = feature_extractor(images=pixels, return_tensors="pt")
batch

{'pixel_values': tensor([[[[ 1.0000,  1.0000,  0.5686,  ..., -0.1451, -0.2471, -0.2078],
          [ 1.0000,  0.9843,  0.5137,  ..., -0.0745,  0.1608,  0.4353],
          [ 0.9922,  0.9373,  0.3882,  ...,  0.5765,  0.8118,  0.9216],
          ...,
          [-0.5529, -0.6392, -0.6392,  ...,  0.1529,  0.1373,  0.1216],
          [-0.5922, -0.6078, -0.5294,  ...,  0.1294,  0.1451,  0.1608],
          [-0.4980, -0.4980, -0.4039,  ...,  0.1294,  0.1451,  0.1686]],

         [[ 0.9686,  0.9843,  0.5765,  ..., -0.3020, -0.3961, -0.3490],
          [ 0.9686,  0.9843,  0.5216,  ..., -0.2078,  0.0353,  0.3255],
          [ 0.9686,  0.9373,  0.3961,  ...,  0.4980,  0.7412,  0.8824],
          ...,
          [-0.8039, -0.9059, -0.9137,  ...,  0.0824,  0.0667,  0.0510],
          [-0.8588, -0.8902, -0.8275,  ...,  0.0431,  0.0588,  0.0745],
          [-0.7647, -0.7804, -0.7176,  ...,  0.0353,  0.0510,  0.0745]],

         [[ 0.9765,  0.8980,  0.3333,  ..., -0.7098, -0.7490, -0.6706],
          [ 0

In [6]:
labels = [path.split("/")[-2] for path in paths]
pprint.pformat(set(labels))

"{'library', 'buffet'}"

In [7]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
targets = le.fit_transform(labels)
targets = torch.as_tensor(targets)
targets

# add labels to batch
batch['labels'] = targets


In [8]:
# inputs = feature_extractor(image, return_tensors="pt")
# model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
# outputs = model(**batch)
# probabilities = outputs.logits.softmax(-1)[0]
# scores, ids = probabilities.topk(10)
# predictions = [{"score": score.item(), "label": model.config.id2label[_id.item()]} for score, _id in zip(scores, ids)]
# pprint.pprint(predictions)


In [9]:
pixel_values = [ image for image in batch["pixel_values"]]
batch["pixel_values"] = pixel_values
batch

{'pixel_values': [tensor([[[ 1.0000,  1.0000,  0.5686,  ..., -0.1451, -0.2471, -0.2078],
         [ 1.0000,  0.9843,  0.5137,  ..., -0.0745,  0.1608,  0.4353],
         [ 0.9922,  0.9373,  0.3882,  ...,  0.5765,  0.8118,  0.9216],
         ...,
         [-0.5529, -0.6392, -0.6392,  ...,  0.1529,  0.1373,  0.1216],
         [-0.5922, -0.6078, -0.5294,  ...,  0.1294,  0.1451,  0.1608],
         [-0.4980, -0.4980, -0.4039,  ...,  0.1294,  0.1451,  0.1686]],

        [[ 0.9686,  0.9843,  0.5765,  ..., -0.3020, -0.3961, -0.3490],
         [ 0.9686,  0.9843,  0.5216,  ..., -0.2078,  0.0353,  0.3255],
         [ 0.9686,  0.9373,  0.3961,  ...,  0.4980,  0.7412,  0.8824],
         ...,
         [-0.8039, -0.9059, -0.9137,  ...,  0.0824,  0.0667,  0.0510],
         [-0.8588, -0.8902, -0.8275,  ...,  0.0431,  0.0588,  0.0745],
         [-0.7647, -0.7804, -0.7176,  ...,  0.0353,  0.0510,  0.0745]],

        [[ 0.9765,  0.8980,  0.3333,  ..., -0.7098, -0.7490, -0.6706],
         [ 0.9765,  0.8745,

In [10]:
from transformers import TrainingArguments, Trainer

# Training Args
args = TrainingArguments(
    f"mit-indoor-scenes",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir='logs',
    remove_unused_columns=False,
)


# evaluation
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


# model

id2label = {id:label for id, label in zip(targets.tolist(), labels)}
label2id = {label:id for id,label in zip(targets.tolist(), labels )}
num_labels = len(set(labels))


pprint.pprint(f"id2label: {id2label}")
pprint.pprint(f"label2id: {label2id}")
pprint.pprint(f"num labels: {num_labels}")


"id2label: {1: 'library', 0: 'buffet'}"
"label2id: {'library': 1, 'buffet': 0}"
'num labels: 2'


In [11]:


model = ViTForImageClassification.from_pretrained(
            MODEL,
            num_labels=2,
            id2label=id2label,
            label2id=label2id
    )


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
batch["pixel_values"]

[tensor([[[ 1.0000,  1.0000,  0.5686,  ..., -0.1451, -0.2471, -0.2078],
          [ 1.0000,  0.9843,  0.5137,  ..., -0.0745,  0.1608,  0.4353],
          [ 0.9922,  0.9373,  0.3882,  ...,  0.5765,  0.8118,  0.9216],
          ...,
          [-0.5529, -0.6392, -0.6392,  ...,  0.1529,  0.1373,  0.1216],
          [-0.5922, -0.6078, -0.5294,  ...,  0.1294,  0.1451,  0.1608],
          [-0.4980, -0.4980, -0.4039,  ...,  0.1294,  0.1451,  0.1686]],
 
         [[ 0.9686,  0.9843,  0.5765,  ..., -0.3020, -0.3961, -0.3490],
          [ 0.9686,  0.9843,  0.5216,  ..., -0.2078,  0.0353,  0.3255],
          [ 0.9686,  0.9373,  0.3961,  ...,  0.4980,  0.7412,  0.8824],
          ...,
          [-0.8039, -0.9059, -0.9137,  ...,  0.0824,  0.0667,  0.0510],
          [-0.8588, -0.8902, -0.8275,  ...,  0.0431,  0.0588,  0.0745],
          [-0.7647, -0.7804, -0.7176,  ...,  0.0353,  0.0510,  0.0745]],
 
         [[ 0.9765,  0.8980,  0.3333,  ..., -0.7098, -0.7490, -0.6706],
          [ 0.9765,  0.8745,

In [13]:
from datasets import Dataset

# from_dict removes the tensor and converts it as a list
dataset = Dataset.from_dict(batch)
dataset

Dataset({
    features: ['pixel_values', 'labels'],
    num_rows: 214
})

In [18]:
from torch.utils.data import DataLoader
import torch

def collate_fn(examples):
    # we convert the list here back as a tensor because we lost this
    # when converting to a "Dataset"
    pixel_values = torch.stack([torch.tensor(example["pixel_values"]) for example in examples])
    labels = torch.tensor([example["labels"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

In [20]:

# trainer
trainer = Trainer(
    model,
    args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

trainer.train()

***** Running training *****
  Num examples = 214
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 66
 33%|███▎      | 22/66 [05:19<08:50, 12.06s/it]***** Running Evaluation *****
  Num examples = 214
  Batch size = 4
                                               
 33%|███▎      | 22/66 [07:08<08:50, 12.06s/it]Saving model checkpoint to mit-indoor-scenes/checkpoint-22
Configuration saved in mit-indoor-scenes/checkpoint-22/config.json


{'eval_loss': 0.38900071382522583, 'eval_accuracy': 1.0, 'eval_runtime': 108.7629, 'eval_samples_per_second': 1.968, 'eval_steps_per_second': 0.496, 'epoch': 1.0}


Model weights saved in mit-indoor-scenes/checkpoint-22/pytorch_model.bin
Configuration saved in mit-indoor-scenes/checkpoint-22/preprocessor_config.json
 67%|██████▋   | 44/66 [12:18<04:18, 11.75s/it]***** Running Evaluation *****
  Num examples = 214
  Batch size = 4
                                               
 67%|██████▋   | 44/66 [13:48<04:18, 11.75s/it]Saving model checkpoint to mit-indoor-scenes/checkpoint-44
Configuration saved in mit-indoor-scenes/checkpoint-44/config.json


{'eval_loss': 0.21892030537128448, 'eval_accuracy': 1.0, 'eval_runtime': 90.0387, 'eval_samples_per_second': 2.377, 'eval_steps_per_second': 0.6, 'epoch': 2.0}


Model weights saved in mit-indoor-scenes/checkpoint-44/pytorch_model.bin
Configuration saved in mit-indoor-scenes/checkpoint-44/preprocessor_config.json
100%|██████████| 66/66 [18:24<00:00, 10.62s/it]***** Running Evaluation *****
  Num examples = 214
  Batch size = 4
                                               
100%|██████████| 66/66 [19:55<00:00, 10.62s/it]Saving model checkpoint to mit-indoor-scenes/checkpoint-66
Configuration saved in mit-indoor-scenes/checkpoint-66/config.json


{'eval_loss': 0.17354483902454376, 'eval_accuracy': 1.0, 'eval_runtime': 90.7585, 'eval_samples_per_second': 2.358, 'eval_steps_per_second': 0.595, 'epoch': 3.0}


Model weights saved in mit-indoor-scenes/checkpoint-66/pytorch_model.bin
Configuration saved in mit-indoor-scenes/checkpoint-66/preprocessor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from mit-indoor-scenes/checkpoint-22 (score: 1.0).
100%|██████████| 66/66 [19:56<00:00, 18.13s/it]

{'train_runtime': 1196.8297, 'train_samples_per_second': 0.536, 'train_steps_per_second': 0.055, 'train_loss': 0.35254446665445965, 'epoch': 3.0}





TrainOutput(global_step=66, training_loss=0.35254446665445965, metrics={'train_runtime': 1196.8297, 'train_samples_per_second': 0.536, 'train_steps_per_second': 0.055, 'train_loss': 0.35254446665445965, 'epoch': 3.0})

In [None]:
# from transformers import AdamW

# model = ViTForImageClassification.from_pretrained(
#             "google/vit-base-patch16-224",
#     )

# optimizer = AdamW(model.parameters())
# loss = model(**batch).loss
# loss.backward()
# optimizer.step()

In [None]:
# path = "/Users/vincent/datasets/mit_indoor_scenes/sample/library/450px_Bibliothek_im_Reformierten_Kollegium_Debrecen.jpg"
# image = PIL.Image.open(path)
# feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
# inputs = feature_extractor(images= image, return_tensors="pt")
# outputs = model(**inputs)
# probabilities = outputs.logits.softmax(-1)[0]
# scores, ids = probabilities.topk(10)
# predictions = [{"score": score.item(), "label": model.config.id2label[_id.item()]} for score, _id in zip(scores, ids)]
# pprint.pprint(predictions)

In [None]:
# image.show()